This tutorial shows how the team added tax programs data to run descriptive statistics and conduct predictive analysis.

# load necessary packages ----

library( here )
library( data.table )
library( dplyr )
library( pander )
library( tidyverse )
library( readr )
library( forecast )
library( knitr )
library( stargazer )
library( scales )
library(import)
import::here("S_TYPE",
             "d",
             "df",
             "d3",
             "d6",
             "PLOTS",
             "%>%",
             # notice the use of here::here() that points to the .R file
             # where all these R objects are created
             .from = here::here("labs/wk06/unified_team3_source.R"),
             .character_only = TRUE)

detach("package:import", unload = TRUE)

NMTC and LIHTC variables

The data set below shows variables in NMTC and LIHTC.

d %>% head()

Below are list of all column names in the NMTC and LIHTC data set use for analysis.

colnames(d)
##  [1] "tractid"             "mhmval00"            "mhmval12"           
##  [4] "hinc00"              "hu00"                "vac00"              
##  [7] "own00"               "rent00"              "h30old00"           
## [10] "empclf00"            "clf00"               "unemp00"            
## [13] "prof00"              "dpov00"              "npov00"             
## [16] "ag25up00"            "hs00"                "col00"              
## [19] "pop00.x"             "nhwht00"             "nhblk00"            
## [22] "hisp00"              "asian00"             "cbsa"               
## [25] "cbsaname"            "p.white"             "p.black"            
## [28] "p.hisp"              "p.asian"             "p.hs"               
## [31] "p.col"               "p.prof"              "p.unemp"            
## [34] "p.vacant"            "mhv.change.00.to.10" "p.mhv.change"       
## [37] "pov.rate"            "mhv.00"              "mhv.10"             
## [40] "mhv.change"          "mhv.growth"

Data set of change variables

The change variable are the new data set that will be use for analysis.

Data for diff-indiff model

d3 %>% head()
saveRDS(d, here("data/rodeo-data-wk.cs.rds"))

Load all rodeo data set

readRDS(file = here("data/rodeo-data-wk.cs.rds"))


Data maniest

This help to explain how data set use changes for the period under review

ltdb.row.2000s <- read.csv(here("data/raw/ltdb_std_2000_sample.csv"))
ltdb.row.2000f <- read.csv(here("data/raw/ltdb_std_2000_fullcount.csv"))
ltdb.rodeo.2000 <- readRDS(here("data/rodeo/ltdb-2000.rds"))

ltdb.row.2010s <- read.csv(here("data/raw/ltdb_std_2010_sample.csv"))
ltdb.row.2010f <- read.csv(here("data/raw/ltdb_std_2000_fullcount.csv"))
ltdb.rodeo.2010 <- readRDS(here("data/rodeo/ltdb-2010.rds"))

complete.rodeo <- readRDS(file = here("data/rodeo-data-wk.cs.rds"))

However, the sample data set of 2000 has 72693 rows which is the same row for the full data set. Which shows that there is different between the sample data set and the full data set.

In the case of the 2010 data set, the sample data set have 73056 while the full data set have 72693 which means there are more treats in the 2010 full data set as compared to the sample data set for 2010

Also the rodeo data set for 2000 has 72693 rows which shows that all the data set in the full data set are included in the sample data set. Moreover, the row of rodeo data set for 2010 is 74022 which shows that the rows in the full data set is different from the sample data set. Because the marging in the 2o10 data set the number of rows increases in the rodeo data set.

nrow(ltdb.row.2000s)
## [1] 72693
nrow(ltdb.row.2000f)
## [1] 72693
nrow(ltdb.rodeo.2000)
## [1] 72693
nrow(ltdb.row.2010s)
## [1] 73056

However, from 2000 data set above, it is observed that all the count are the same which means there is no changes in the data set.

nrow(ltdb.row.2010f)
## [1] 72693
nrow(ltdb.rodeo.2010)
## [1] 74022

The completed data set is different from the rodeo data sets for 2000 and 2010. The different is as result of the many steps taken during the analyssis. During the analysis we filtered out row tracts for rural and maintain urban tract. Also all the row tracts for Median Home Value for 2000 that are less than 10000. The series of adjustment that took place reduces the number rows for completed rodeo data set as compare to the rodeo data set for 2000 and 2010.

nrow(complete.rodeo)
## [1] 59722
PLOTS
## $pov_rate_2000
## $pov_rate_2000$nmtc

## 
## $pov_rate_2000$lihtc

## 
## 
## $mhv_2000
## $mhv_2000$nmtc

## 
## $mhv_2000$lihtc

## 
## 
## $mhv_growth
## $mhv_growth$nmtc

## 
## $mhv_growth$lihtc


Median Home value at baseline

hist( df$MedianHomeValue2000, breaks=200, xlim=c(0,500000), 
      col="gray20", border="white",
      axes=F, 
      xlab="MHV (median = $138k)",
      ylab="",
      main="Median Home Value in 2000 (2010 US dollars)" )
axis( side=1, at=seq(0,500000,100000), 
      labels=c("$0","$100k","$200k","$300k","$400k","$500k") )
abline( v=median( df$MedianHomeValue2000, na.rm=T ), col="orange", lwd=3 )


stargazer( df, 
           type="html", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic Min Pctl(25) Median Mean Pctl(75) Max
MedianHomeValue2000 11,167 105,661 154,903 187,129 224,337 1,288,551
MedianHomeValue2010 9,999 123,200 193,200 246,570 312,000 1,000,001
MHV.Change.00.to.10 -1,228,651 7,187 36,268 60,047 94,881 1,000,001
MHV.Growth.00.to.12 -97 6 25 33 50 6,059


hg <-
hist( df$MHV.Growth.00.to.12, breaks=5000, 
      xlim=c(-100,200), yaxt="n", xaxt="n",
      xlab="", cex.main=1.5,
      ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
      col="gray40", border="white" )
axis( side=1, at=seq( from=-100, to=200, by=50 ), 
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
ymax <- max( hg$count )
        
mean.x <- mean( df$MHV.Growth.00.to.12, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=100, y=(0.5*ymax), 
      labels=paste0( "Mean = ", round(mean.x,0), "%"), 
      col="darkorange", cex=1.8, pos=4 )
median.x <- median( df$MHV.Growth.00.to.12, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=100, y=(0.6*ymax), 
      labels=paste0( "Median = ", round(median.x,0), "%"), 
      col="dodgerblue", cex=1.8, pos=4 )


Below is the difference-in-difference data framework used for the analysis of NMTC

d3 %>% head()


Below is the impact of NMTC on tracts involved in the treatment group

m <- lm( y~treat+ post + treat*post, data=d3 )

stargazer::stargazer(m, type = "html", digits = 2)
Dependent variable:
y
treat -0.26***
(0.02)
post 0.23***
(0.004)
treat:post 0.10***
(0.02)
Constant 11.96***
(0.003)
Observations 118,132
R2 0.04
Adjusted R2 0.04
Residual Std. Error 0.64 (df = 118128)
F Statistic 1,432.34*** (df = 3; 118128)
Note: p<0.1; p<0.05; p<0.01

From the model above, the post variable indicate a 23% growth across all the tracts regardless whether they participated in te NMTC program. However, the tracts that participated in the NMTC program have 10% growth

Below is the difference-in-difference data framework used for the analysis of LIHTC

d6 %>% head()


Below is the impact of NMTC on tracts involved in the treatment group

m.lihtc <- lm( y~treat + post + treat*post, data=d6 )

stargazer::stargazer(m.lihtc, type = "html", digits = 2)
Dependent variable:
y
treat -0.21***
(0.01)
post 0.23***
(0.004)
treat:post 0.01
(0.01)
Constant 11.98***
(0.003)
Observations 118,132
R2 0.04
Adjusted R2 0.04
Residual Std. Error 0.64 (df = 118128)
F Statistic 1,799.55*** (df = 3; 118128)
Note: p<0.1; p<0.05; p<0.01

The post variable indicate a 23% growth across all the tracts regardless whether they participated in the LIHTC program. However, the tracts that participated in the LIHTC program have 0.01% growth which have no statistical significant.

As compare to the NMTC, it is observed that tracts that participated in the NMTC got more growth and statistically significant.

Moreover, the R2 and the standard error for tracts that participated in the NMTC and LIHTC is the same.

Household Income 2000

From the analysis, tract that received LUHTC porgram have

### HOUSEHOLD INCOME COMPARISONIN 2000:
### PROGRAM RECIPIENTS VS NON-RECIPIENT TRACTS
# Tracts that received LIHTC
mean( d$hinc00[ d$num.lihtc > 0 ] )
## [1] NaN

The tract that do not received the program hand

# Tracts that did not 
mean( d$hinc00[ d$num.lihtc == 0 ] )
## [1] NaN

When comparing recipient tracts to non-recipient tracts it is clearly that they are different. The NMTC program, seem to have more effec as compared to LIHTC.