Add Tax Programs Data

This tutorial shows how the team added tax programs data to run descriptive statistics and conduct predictive analysis.

# load necessary packages ----

library( here )
library( data.table )
library( dplyr )
library( pander )
library( tidyverse )
library( readr )
library( forecast )
library( knitr )
library( stargazer )
library( scales )
library(import)

import::here("S_TYPE",
             "d",
             "df",
             "d3",
             "d6",
             "PLOTS",
             "%>%",
             # notice the use of here::here() that points to the .R file
             # where all these R objects are created
             .from = here::here("labs/wk06/unified_team3_source.R"),
             .character_only = TRUE)

detach("package:import", unload = TRUE)

NMTC and LIHTC variables

The data set below shows variables in NMTC and LIHTC.

# Preview the first rows of the NMTC/LIHTC data set
head( d )

Below is the list of all column names in the NMTC and LIHTC data set used for analysis.

# List every column name in d
colnames(d)

##  [1] "tractid"             "mhmval00"            "mhmval12"           
##  [4] "hinc00"              "hu00"                "vac00"              
##  [7] "own00"               "rent00"              "h30old00"           
## [10] "empclf00"            "clf00"               "unemp00"            
## [13] "prof00"              "dpov00"              "npov00"             
## [16] "ag25up00"            "hs00"                "col00"              
## [19] "pop00.x"             "nhwht00"             "nhblk00"            
## [22] "hisp00"              "asian00"             "cbsa"               
## [25] "cbsaname"            "p.white"             "p.black"            
## [28] "p.hisp"              "p.asian"             "p.hs"               
## [31] "p.col"               "p.prof"              "p.unemp"            
## [34] "p.vacant"            "mhv.change.00.to.10" "p.mhv.change"       
## [37] "pov.rate"            "mhv.00"              "mhv.10"             
## [40] "mhv.change"          "mhv.growth"

Data set of change variables

The change variables form the new data set that will be used for analysis.

Data for diff-in-diff model

# Preview the first rows of the diff-in-diff data set d3
head( d3 )

# Persist d as an RDS file under the project's data/ directory
saveRDS(object = d,
        file = here("data/rodeo-data-wk.cs.rds"))

Load all rodeo data sets

# Read the saved RDS file back in. The result is not assigned to a name here,
# so at top level the object is only auto-printed.
readRDS(file = here("data/rodeo-data-wk.cs.rds"))

Data manifest

This helps to explain how the data sets used change over the period under review.

# Load the raw LTDB CSV extracts and the files under data/rodeo for each
# census year so their row counts can be compared further down.

# 2000: sample file, full-count file, and rodeo file
ltdb.row.2000s <- read.csv(here("data/raw/ltdb_std_2000_sample.csv"))
ltdb.row.2000f <- read.csv(here("data/raw/ltdb_std_2000_fullcount.csv"))
ltdb.rodeo.2000 <- readRDS(here("data/rodeo/ltdb-2000.rds"))

# 2010: sample file, full-count file, and rodeo file
ltdb.row.2010s <- read.csv(here("data/raw/ltdb_std_2010_sample.csv"))
# Reads the 2010 full-count file; this line previously re-read the 2000
# full-count CSV, so the 2010 "full" row count duplicated the 2000 one.
ltdb.row.2010f <- read.csv(here("data/raw/ltdb_std_2010_fullcount.csv"))
ltdb.rodeo.2010 <- readRDS(here("data/rodeo/ltdb-2010.rds"))

# Final analysis data set stored at data/rodeo-data-wk.cs.rds
complete.rodeo <- readRDS(file = here("data/rodeo-data-wk.cs.rds"))

The 2000 sample data set has 72693 rows, which is the same number of rows as the 2000 full data set. This shows that there is no difference in the number of tracts between the sample data set and the full data set.

In the case of the 2010 data, the sample data set has 73056 rows while the full data set has 72693 rows, which means there are more tracts in the 2010 sample data set than in the 2010 full data set.

Also, the rodeo data set for 2000 has 72693 rows, which shows that all the tracts in the full data set are also included in the sample data set. Moreover, the rodeo data set for 2010 has 74022 rows, which shows that the tracts in the full data set differ from those in the sample data set. Because of the merging in the 2010 data, the number of rows increases in the rodeo data set.

# Row count of the 2000 sample CSV
nrow(ltdb.row.2000s)

## [1] 72693

# Row count of the 2000 full-count CSV
nrow(ltdb.row.2000f)

## [1] 72693

# Row count of the 2000 rodeo file (data/rodeo/ltdb-2000.rds)
nrow(ltdb.rodeo.2000)

## [1] 72693

# Row count of the 2010 sample CSV
nrow(ltdb.row.2010s)

## [1] 73056

From the 2000 data sets above, it is observed that all the row counts are the same, which means there are no changes across the 2000 data sets.

# Row count of the data loaded into ltdb.row.2010f
nrow(ltdb.row.2010f)

## [1] 72693

# Row count of the 2010 rodeo file (data/rodeo/ltdb-2010.rds)
nrow(ltdb.rodeo.2010)

## [1] 74022

The completed data set is different from the rodeo data sets for 2000 and 2010. The difference is a result of the many steps taken during the analysis. During the analysis we filtered out rural tracts and kept urban tracts. We also removed all tracts whose Median Home Value for 2000 is less than 10000. This series of adjustments reduces the number of rows in the completed rodeo data set as compared to the rodeo data sets for 2000 and 2010.

# Row count of the data set read from data/rodeo-data-wk.cs.rds
nrow(complete.rodeo)

## [1] 59722

# Auto-print the PLOTS object imported from the team source file; the printed
# structure shows pov_rate_2000, mhv_2000 and mhv_growth entries, each with
# nmtc and lihtc elements.
PLOTS

## $pov_rate_2000
## $pov_rate_2000$nmtc

## 
## $pov_rate_2000$lihtc

## 
## 
## $mhv_2000
## $mhv_2000$nmtc

## 
## $mhv_2000$lihtc

## 
## 
## $mhv_growth
## $mhv_growth$nmtc

## 
## $mhv_growth$lihtc

Median Home value at baseline

# Histogram of tract-level median home value in 2000, with the median marked
# by a vertical orange line.
# The median is computed once and reused for both the axis label and the line,
# so the label cannot drift from the data (it was hard-coded as "$138k").
mhv.median <- median( df$MedianHomeValue2000, na.rm=TRUE )

hist( df$MedianHomeValue2000, breaks=200, xlim=c(0,500000), 
      col="gray20", border="white",
      axes=FALSE,   # default axes suppressed; a dollar-formatted x axis is added below
      xlab=paste0( "MHV (median = $", round( mhv.median/1000 ), "k)" ),
      ylab="",
      main="Median Home Value in 2000 (2010 US dollars)" )
axis( side=1, at=seq(0,500000,100000), 
      labels=c("$0","$100k","$200k","$300k","$400k","$500k") )
abline( v=mhv.median, col="orange", lwd=3 )

# HTML descriptive-statistics table for df, rounded to whole numbers
summary.cols <- c("min", "p25", "median", "mean", "p75", "max")
stargazer(df, type = "html", digits = 0, summary.stat = summary.cols)


Statistic	Min	Pctl(25)	Median	Mean	Pctl(75)	Max

MedianHomeValue2000	11,167	105,661	154,903	187,129	224,337	1,288,551
MedianHomeValue2010	9,999	123,200	193,200	246,570	312,000	1,000,001
MHV.Change.00.to.10	-1,228,651	7,187	36,268	60,047	94,881	1,000,001
MHV.Growth.00.to.12	-97	6	25	33	50	6,059

# Histogram of tract-level growth in median home value, with the mean and
# median drawn as dashed vertical lines and labelled on the plot.
# The hist() result is kept so the tallest bar can be used to place the labels.
hg <-
hist( df$MHV.Growth.00.to.12, breaks=5000, 
      xlim=c(-100,200), yaxt="n", xaxt="n",
      xlab="", cex.main=1.5,
      ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
      col="gray40", border="white" )
axis( side=1, at=seq( from=-100, to=200, by=50 ), 
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
# Use the full component name; `hg$count` only worked through partial matching
ymax <- max( hg$counts )

# Mean: dashed orange line plus label at half the height of the tallest bar
mean.x <- mean( df$MHV.Growth.00.to.12, na.rm=TRUE )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=100, y=(0.5*ymax), 
      labels=paste0( "Mean = ", round(mean.x,0), "%"), 
      col="darkorange", cex=1.8, pos=4 )
# Median: dashed blue line plus label slightly above the mean label
median.x <- median( df$MHV.Growth.00.to.12, na.rm=TRUE )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=100, y=(0.6*ymax), 
      labels=paste0( "Median = ", round(median.x,0), "%"), 
      col="dodgerblue", cex=1.8, pos=4 )

Below is the difference-in-difference data framework used for the analysis of NMTC

# Preview the first rows of the NMTC diff-in-diff data set
head( d3 )

Below is the impact of NMTC on tracts involved in the treatment group

# Difference-in-difference regression on d3.
# `treat * post` expands to treat + post + treat:post, the same terms as the
# longer spelling; the treat:post coefficient is the diff-in-diff estimate.
m <- lm(y ~ treat * post, data = d3)

# Render the regression table as HTML with two decimal places
stargazer(m, type = "html", digits = 2)


	Dependent variable:

	y

treat	-0.26^***
	(0.02)

post	0.23^***
	(0.004)

treat:post	0.10^***
	(0.02)

Constant	11.96^***
	(0.003)


Observations	118,132
R²	0.04
Adjusted R²	0.04
Residual Std. Error	0.64 (df = 118128)
F Statistic	1,432.34^*** (df = 3; 118128)

Note:	*p<0.1; **p<0.05; ***p<0.01

From the model above, the post variable indicates a 23% growth across all the tracts regardless of whether they participated in the NMTC program. However, the tracts that participated in the NMTC program have an additional 10% growth on top of that.

Below is the difference-in-difference data framework used for the analysis of LIHTC

# Preview the first rows of the LIHTC diff-in-diff data set
head( d6 )

Below is the impact of LIHTC on tracts involved in the treatment group

# Difference-in-difference regression on d6.
# `treat * post` expands to treat + post + treat:post, the same terms as the
# longer spelling; the treat:post coefficient is the diff-in-diff estimate.
m.lihtc <- lm(y ~ treat * post, data = d6)

# Render the regression table as HTML with two decimal places
stargazer(m.lihtc, type = "html", digits = 2)


	Dependent variable:

	y

treat	-0.21^***
	(0.01)

post	0.23^***
	(0.004)

treat:post	0.01
	(0.01)

Constant	11.98^***
	(0.003)


Observations	118,132
R²	0.04
Adjusted R²	0.04
Residual Std. Error	0.64 (df = 118128)
F Statistic	1,799.55^*** (df = 3; 118128)

Note:	*p<0.1; **p<0.05; ***p<0.01

The post variable indicates a 23% growth across all the tracts regardless of whether they participated in the LIHTC program. However, the tracts that participated in the LIHTC program have only an additional 1% growth (a coefficient of 0.01), which is not statistically significant.

Compared to the LIHTC, it is observed that tracts that participated in the NMTC saw more growth, and that growth is statistically significant.

Moreover, the R² and the residual standard error are the same for the NMTC and LIHTC models.

Household Income 2000

From the analysis, tracts that received the LIHTC program have the following mean household income in 2000:

### HOUSEHOLD INCOME COMPARISON IN 2000:
### PROGRAM RECIPIENTS VS NON-RECIPIENT TRACTS
# Tracts that received LIHTC
# `d$num.lihtc` evaluates to NULL when the column is absent, which makes the
# subset empty and the mean NaN; fail loudly instead of reporting NaN.
if ( !"num.lihtc" %in% names( d ) ) {
  stop( "column 'num.lihtc' not found in d", call.=FALSE )
}
# which() drops NA comparisons so they do not inject NA rows into the subset;
# na.rm=TRUE ignores missing incomes among the selected tracts.
mean( d$hinc00[ which( d$num.lihtc > 0 ) ], na.rm=TRUE )

## [1] NaN

The tracts that did not receive the program, on the other hand, have the following mean household income in 2000:

# Tracts that did not receive LIHTC
# `d$num.lihtc` evaluates to NULL when the column is absent, which makes the
# subset empty and the mean NaN; fail loudly instead of reporting NaN.
if ( !"num.lihtc" %in% names( d ) ) {
  stop( "column 'num.lihtc' not found in d", call.=FALSE )
}
# which() drops NA comparisons so they do not inject NA rows into the subset;
# na.rm=TRUE ignores missing incomes among the selected tracts.
mean( d$hinc00[ which( d$num.lihtc == 0 ) ], na.rm=TRUE )

## [1] NaN

When comparing recipient tracts to non-recipient tracts, it is clear that they are different. The NMTC program seems to have more effect as compared to LIHTC.