Introduction
This is a tutorial that shows you some common preprocessing steps you will need to do in this week’s lab.
Please use the import::here() function
This tutorial does not use import::here(), but your submission for week 03 requires it. The code is kept here to help you understand why certain decisions were made.
If you need to see how to use the import::here() function, please see the "Project Data Steps" tutorial.
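For reference, a minimal sketch of what an import::here() call could look like in your week 03 submission is shown below. The helper names clean_d and build_year are hypothetical placeholders; substitute the functions your utilities.R actually defines.
# a sketch only - "clean_d" and "build_year" are hypothetical helper names
import::here( "clean_d",
              "build_year",
              .from = here::here( "labs/wk03/utilities.R" ),
              .character_only = TRUE )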
library( dplyr )
library( here )
library( knitr )
library( pander )
library( stargazer )
library( scales )
# set stargazer type to text for
# previewing in RMD docs but
# convert to type HTML when knitting
# (next code chunk)
<- "text" s.type
###################################
#
# STARGAZER SETTINGS
#
###################################
# DO NOT RUN CHUNK UNLESS KNITTING:
# changes table formats to html
# before rendering RMD docs
<- "html" s.type
For this part of the project you will:
- Calculate change in the MHV variable between 2000 and 2010.
- Measure gentrification that occurs between 2000 and 2010.
- Pick a city and create a new Dorling cartogram to visualize your data.
- Prepare descriptive statistics and choropleth maps describing the MHV change variable and gentrification.
This tutorial demonstrates some of the initial steps of variable creation and data exploration for the project.
Data
See the data steps for the wrangling that occurs during the process of creating our rodeo datasets.
Reminder: stop using static file paths and start using the here::here() function
I cannot stress this enough: all data files must be imported using relative paths. You cannot use a static file path (e.g. /Users/myname/some/file/path) because it breaks portability and therefore makes it impossible for anyone to reproduce your analysis: no one else can re-run your file, because that static file path exists on your machine and yours alone. For a refresher on why you cannot use static file paths and need to use the here::here() function, please see Week 02's lecture videos.
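As a quick illustration, using one of the rodeo files loaded later in this tutorial:
# fragile: a static path that exists only on one machine
# d1 <- readRDS( "/Users/myname/some/file/path/LTDB-2000.rds" )

# portable: a relative path resolved from the project root
d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )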
Before moving on, be sure to run the Data Steps tutorial
The following chunks depend on you having clean data in your data/rodeo folder. These files are generated by following along with the Data Steps tutorial.
To re-run the tutorial locally, follow these steps:
- Download a local copy of the labs/utilities.R file.
- Save the file as labs/wk03/utilities.R.
- Download a local copy of the labs/project_data_steps.R file.
- Save the file as labs/wk03/project_data_steps.R.
- Select all the lines within labs/wk03/project_data_steps.R and run them to produce all of the clean data files within data/rodeo (or source the file as shown in the sketch after this list).
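Alternatively, assuming the script runs top to bottom without manual input, you can source the whole file in one call:
# equivalent to selecting and running every line of the script
source( here::here( "labs/wk03/project_data_steps.R" ) )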
# note: please do not use static file paths
# note: notice down below the use of here::here()
<- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
d1 <- readRDS( here::here( "data/rodeo/LTDB-2010.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )
md
# check to make sure we are not losing
# or gaining observations in the merge
nrow( d1 )
## [1] 72693
d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )
nrow( d )
## [1] 72693
Filter Rural Districts
table( d$urban )
##
## rural urban
## 12971 59722
d <- filter( d, urban == "urban" )
Identify Common Variables
We can create a function to compare variables from the 2000 and 2010 datasets:
# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
  # use regular expressions to remove numeric suffixes
  var.names.1 <- names( df1 )
  var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
  var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )

  var.names.2 <- names( df2 )
  var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
  var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )

  shared <- intersect( var.names.1, var.names.2 ) %>% sort()
  print( "SHARED VARIABLES:" )
  print( shared )

  not.shared <- c( setdiff( var.names.1, var.names.2 ),
                   setdiff( var.names.2, var.names.1 ) ) %>% sort()

  print( "NOT SHARED:" )
  print( not.shared )

  d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
  d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
  dd <- rbind( d.vars1, d.vars2 )

  return( dd )
}

vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
## [1] "a15asn" "a15blk" "a15hsp" "a15ntv" "a15wht" "a18und" "a60asn"
## [8] "a60blk" "a60hsp" "a60ntv" "a60up" "a60wht" "a75up" "ag15up"
## [15] "ag18cv" "ag25up" "ag5up" "ageasn" "ageblk" "agehsp" "agentv"
## [22] "agewht" "asian" "china" "clf" "col" "cuban" "dapov"
## [29] "dbpov" "dflabf" "dfmpov" "dhpov" "dmulti" "dnapov" "dpov"
## [36] "dwpov" "empclf" "family" "fb" "fhh" "filip" "flabf"
## [43] "geanc" "gefb" "h10yrs" "h30old" "haw" "hh" "hha"
## [50] "hhb" "hhh" "hhw" "hinc" "hinca" "hincb" "hinch"
## [57] "hincw" "hisp" "hs" "hu" "incpc" "india" "iranc"
## [64] "irfb" "itanc" "itfb" "japan" "korea" "lep" "manuf"
## [71] "mar" "mex" "mhmval" "mrent" "multi" "n10imm" "n65pov"
## [78] "napov" "nat" "nbpov" "nfmpov" "nhblk" "nhpov" "nhwht"
## [85] "nnapov" "npov" "ntv" "nwpov" "ohu" "olang" "own"
## [92] "pop" "pr" "prof" "rent" "ruanc" "rufb" "scanc"
## [99] "scfb" "semp" "tractid" "unemp" "vac" "vet" "viet"
## [106] "wds"
## [1] "NOT SHARED:"
## [1] "a65asn" "a65blk" "a65hsp" "a65ntv" "a65wht" "cni16u" "dis"
## [8] "hu00sp" "ohu00sp" "p10imm" "p10yrs" "p15asn" "p15blk" "p15hsp"
## [15] "p15ntv" "p15wht" "p18und" "p30old" "p60up" "p65asn" "p65blk"
## [22] "p65hsp" "p65ntv" "p65pov" "p65wht" "p75up" "papov" "pasian"
## [29] "pbpov" "pchina" "pcol" "pcuban" "pfb" "pfhh" "pfilip"
## [36] "pflabf" "pfmpov" "pgeanc" "pgefb" "phaw" "phisp" "phpov"
## [43] "phs" "pindia" "piranc" "pirfb" "pitanc" "pitfb" "pjapan"
## [50] "pkorea" "plep" "pmanuf" "pmar" "pmex" "pmulti" "pnapov"
## [57] "pnat" "pnhblk" "pnhwht" "pntv" "polang" "pown" "ppov"
## [64] "ppr" "pprof" "pruanc" "prufb" "pscanc" "pscfb" "psemp"
## [71] "punemp" "pvac" "pvet" "pviet" "pwds" "pwpov"
head( vars )
Create Dataset for Analysis
Create subset for the analysis
d.full <- d  # keep a copy so we don't have to reload

d <- d.full  # store original in case you need to reset anything

d <- select( d, tractid, mhmval00, mhmval12, hinc00,
             hu00, own00, rent00,
             empclf00, clf00, unemp00, prof00,
             dpov00, npov00,
             ag25up00, hs00, col00,
             pop00.x, nhwht00, nhblk00, hisp00, asian00,
             cbsa, cbsaname )

d <-
  d %>%
  mutate( p.white = 100 * nhwht00 / pop00.x,
          p.black = 100 * nhblk00 / pop00.x,
          p.hisp = 100 * hisp00 / pop00.x,
          p.asian = 100 * asian00 / pop00.x,
          p.hs = 100 * (hs00+col00) / ag25up00,
          p.col = 100 * col00 / ag25up00,
          p.prof = 100 * prof00 / empclf00,
          p.unemp = 100 * unemp00 / clf00,
          pov.rate = 100 * npov00 / dpov00 )
If you call the stargazer() function with a linear model object (a regression model) it will create a nicely-formatted regression table for you.
If you instead use a data frame object it will create a table of descriptive statistics.
You can set the statistics you would like reported in the table with the summary.stat= argument.
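For example, a regression table could be produced like this (a hypothetical model for illustration only; the chunk that follows instead passes the data frame d to get descriptive statistics):
# hypothetical model for illustration - not part of the lab
m <- lm( pov.rate ~ p.unemp + p.col, data=d )
stargazer( m, type=s.type, digits=2 )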
stargazer( d,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
mhmval00 | 0 | 81,600 | 119,900 | 144,738 | 173,894 | 1,000,001 |
mhmval12 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
hinc00 | 2,499 | 33,000 | 43,825 | 47,657 | 58,036 | 200,001 |
hu00 | 0 | 1,102 | 1,519 | 1,570 | 1,999 | 11,522 |
own00 | 0 | 542 | 902 | 939 | 1,289 | 4,911 |
rent00 | 0 | 195 | 398 | 516 | 712 | 8,544 |
empclf00 | 0 | 1,205 | 1,756 | 1,820 | 2,373 | 10,334 |
clf00 | 0 | 1,302 | 1,865 | 1,930 | 2,502 | 11,251 |
unemp00 | 0 | 51 | 87 | 110 | 140 | 6,405 |
prof00 | 0 | 299 | 539 | 637 | 873 | 6,610 |
dpov00 | 0 | 2,671 | 3,718 | 3,804 | 4,871 | 23,892 |
npov00 | 0 | 149 | 304 | 452 | 601 | 5,515 |
ag25up00 | 0 | 1,763 | 2,451 | 2,520 | 3,224 | 17,974 |
hs00 | 0 | 665 | 1,071 | 1,155 | 1,552 | 8,909 |
col00 | 0 | 243 | 492 | 665 | 923 | 9,313 |
pop00.x | 0 | 2,751 | 3,802 | 3,901 | 4,976 | 36,206 |
nhwht00 | 0 | 1,308 | 2,514 | 2,591 | 3,713 | 20,619 |
nhblk00 | 0 | 41 | 141 | 522 | 527 | 14,039 |
hisp00 | 0 | 55 | 153 | 547 | 533 | 13,391 |
asian00 | 0 | 22 | 65 | 189 | 183 | 9,491 |
p.white | 0 | 47 | 78 | 67 | 91 | 100 |
p.black | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs | 0 | 67 | 72 | 72 | 77 | 100 |
p.col | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate | 0 | 4 | 9 | 12 | 17 | 100 |
Exploration of Median Home Value
Initial conditions in 2000:
# adjust 2000 home values for inflation
mhv.00 <- d$mhmval00 * 1.28855
mhv.10 <- d$mhmval12

mhv.change <- mhv.10 - mhv.00

df <- data.frame( MedianHomeValue2000=mhv.00,
                  MedianHomeValue2010=mhv.10,
                  Change.00.to.10=mhv.change )
stargazer( df,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
MedianHomeValue2000 | 0 | 105,146 | 154,497 | 186,502 | 224,071 | 1,288,551 |
MedianHomeValue2010 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
Change.00.to.10 | -1,228,651 | 7,187 | 36,268 | 60,047 | 94,881 | 1,000,001 |
A note on inflation calculations. You can google “inflation calculator” and you will find a bunch of sites that give you the conversion. Just select 2010 as the final year, your input year, and use $1 as the starting value. For example:
https://westegg.com/inflation/
You will get slightly different results by site.
Otherwise, you can always estimate it if you know the average long-term inflation rate. Since 2000 the rate has averaged about 2.5% in the US, so you can use:
# 10 year inflation factor
(1.025)^10
## [1] 1.280085
Prior to 2000 the rates were higher, and they fluctuated quite a bit through the 70s and 80s, so if you are going back further in time it is better to use a calculator. The calculators should use the actual rates for each year.
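In other words, a calculator compounds the individual annual rates rather than applying a single average. A sketch of the arithmetic (the rates below are round placeholder values, not actual CPI figures):
# placeholder annual inflation rates - NOT actual CPI values
annual.rates <- c( 0.03, 0.02, 0.02, 0.03, 0.03, 0.03, 0.03, 0.04, 0.00, 0.03 )

# cumulative inflation factor over the ten years
prod( 1 + annual.rates )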
Histogram of MHV
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 2000 to 2010",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
      labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ),
      col="darkorange", cex=1.8, pos=3 )

median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
      labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ),
      col="dodgerblue", cex=1.8, pos=3 )
# function to control plot() formatting
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{

  plot( x1, x2,
        pch=19,
        col=gray(0.6, alpha = 0.2),
        cex=2.5,
        bty = "n",
        xlab=lab1,
        ylab=lab2, cex.lab=1.5,
        ... )

  if( draw.line==T ){
    ok <- is.finite(x1) & is.finite(x2)
    lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }

}
Compare 2000 to 2010 distributions.
layout.matrix <- matrix( c( 1,3,
                            2,3 ),
                         nrow=2, ncol=2, byrow=T )
layout( mat = layout.matrix,
heights = c(2,2), # Heights of the two rows
widths = c(3,4)) # Widths of the two columns
# layout.show(3)
par( mar=c(4,0,0,2) )
hist( mhv.00/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
abline( v=seq(0,1000,100), lty=2, col="gray80" )
text( 550, 4000, labels="Median Home \nValue in 2000",
col="darkslateblue", cex=1.8 )
hist( mhv.10/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )
text( 550, 3500, labels="Median Home \nValue in 2010",
col="darkslateblue", cex=1.8 )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
# data reduction - sample 1,000 observations for plotting
df <- data.frame( v00=mhv.00/1000, v10=mhv.10/1000 )
df <- sample_n( df, 1000 )
par( mar=c(4,5,3,2) )
jplot( df$v00, df$v10,
lab1="MHV in 2000", lab2="MHV in 2010",
xlim=c(0,1000), ylim=c(0,1000),
axes=F )
abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
Change in MHV 2000-2010
If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.
The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.
The percent change variable provides some context for growth rates of value in census tracts.
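A quick worked version of the example above:
# $100k gain on a $10 million home is a 1% increase
100000 / 10000000
## [1] 0.01

# $100k gain on a $50k home is a 200% increase
100000 / 50000
## [1] 2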
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- mhv.change / mhv.00
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.96891 0.05918 0.25402 0.33167 0.49556 60.59261 220
# how many cases had increases above 500%
sum( pct.change > 5, na.rm=T )
## [1] 112
# preview tracts with large increases in home values
# to see if increases make sense
d %>%
  filter( pct.change > 5 ) %>%
  head()
Plot the percent change variable:
hg <-
  hist( pct.change, breaks=2000,
        xlim=c(-1,2), yaxt="n", xaxt="n",
        xlab="", cex.main=1.5,
        ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
        col="gray40", border="white" )

axis( side=1, at=seq( from=-1, to=2, by=0.5 ),
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )

ymax <- max( hg$count )

mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax),
      labels=paste0( "Mean = ", round(100*mean.x,0), "%"),
      col="darkorange", cex=1.8, pos=4 )

median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax),
      labels=paste0( "Median = ", round(100*median.x,0), "%"),
      col="dodgerblue", cex=1.8, pos=4 )
Group Growth Rates By Metro Area
We often want to disaggregate descriptive statistics by some grouping in the data, such as metro areas.
dplyr makes this easy by grouping then summarizing the data.
d$mhv.change <- mhv.change
d$pct.change <- pct.change
d$mhv.10 <- mhv.10
d$mhv.00 <- mhv.00

d %>%
  group_by( cbsaname ) %>%
  summarize( ave.change = median( mhv.change, na.rm=T ),
             ave.change.d = dollar( round(ave.change,0) ),
             growth = 100 * median( pct.change, na.rm=T ) ) %>%
  ungroup() %>%
  arrange( - growth ) %>%
  select( - ave.change ) %>%
  head( 25 ) %>%
  pander()
cbsaname | ave.change.d | growth |
---|---|---|
Ocean City, NJ | $154,667 | 93.07 |
Virginia Beach-Norfolk-Newport News, VA | $97,955 | 71.81 |
Casper, WY | $70,770 | 70.03 |
New York-Wayne-White Plains, NY-NJ | $189,118 | 69.86 |
Kingston, NY | $89,272 | 65.08 |
Barnstable Town, MA | $146,088 | 65.07 |
Washington-Arlington-Alexandria DC-VA | $139,136 | 64.82 |
Charlottesville, VA | $104,487 | 62.74 |
Atlantic City, NJ | $94,239 | 62.32 |
Baltimore-Towson, MD | $102,918 | 61.9 |
Bethesda-Frederick-Gaithersburg, MD | $146,639 | 61.46 |
San Luis Obispo-Paso Robles, CA | $172,722 | 61.01 |
Midland, TX | $54,576 | 60.3 |
Los Angeles-Long Beach-Santa Ana, CA | $145,968 | 60.3 |
Redding, CA | $87,677 | 59.12 |
Honolulu, HI | $194,665 | 58.85 |
Chico, CA | $81,746 | 57.83 |
Nassau-Suffolk, NY | $149,785 | 57.62 |
Edison, NJ | $125,056 | 57.19 |
Santa Ana-Anaheim-Irvine, CA | $177,367 | 56.05 |
Poughkeepsie-Newburgh-Middletown, NY | $101,010 | 55.18 |
Flagstaff, AZ | $71,072 | 54.67 |
Salisbury, MD | $60,400 | 54.06 |
Winchester, VA-WV | $73,165 | 53.76 |
Odessa, TX | $26,240 | 53.37 |
Measuring Gentrification
We saved the original merged dataset as d.full, so we don't need to reload it:
Recall our data steps thus far:
# adjust 2000 home values for inflation
mhv.00 <- d.full$mhmval00 * 1.28855
mhv.10 <- d.full$mhmval12

mhv.change <- mhv.10 - mhv.00
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.891 5.918 25.402 33.167 49.556 6059.261 220
d.full$mhv.00 <- mhv.00
d.full$mhv.10 <- mhv.10
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.change
Select Gentrification Variables
Select variables for operationalizing a definition of gentrification:
We need to add some variables from 2010:
Recall we created a 2000 to 2010 variable list for reference:
head( vars )
d3 <- select( d.full,

              # ids / units of analysis
              tractid, cbsa, cbsaname,

              # home value
              mhv.00, mhv.10, mhv.change, pct.change,

              # ses
              hinc00, hu00, own00, rent00,
              hinc12, hu10, own10, rent10,

              # employment
              empclf00, clf00, unemp00, prof00,
              empclf12, clf12, unemp12, prof12,

              # poverty
              dpov00, npov00,
              dpov12, npov12,

              # education
              ag25up00, hs00, col00,
              ag25up12, hs12, col12,

              # race
              pop00.x, nhwht00, nhblk00, hisp00, asian00,
              pop10, nhwht10, nhblk10, hisp10, asian10

            ) # end select
d3 <-
  d3 %>%
  mutate(
# 2000 variables
p.white.00 = 100 * nhwht00 / pop00.x,
p.black.00 = 100 * nhblk00 / pop00.x,
p.hisp.00 = 100 * hisp00 / pop00.x,
p.asian.00 = 100 * asian00 / pop00.x,
p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
p.col.edu.00 = 100 * col00 / ag25up00,
p.prof.00 = 100 * prof00 / empclf00,
p.unemp.00 = 100 * unemp00 / clf00,
pov.rate.00 = 100 * npov00 / dpov00,
# 2010 variables
p.white.10 = 100 * nhwht10 / pop10,
p.black.10 = 100 * nhblk10 / pop10,
p.hisp.10 = 100 * hisp10 / pop10,
p.asian.10 = 100 * asian10 / pop10,
p.hs.edu.10 = 100 * (hs12+col12) / ag25up12,
p.col.edu.10 = 100 * col12 / ag25up12,
p.prof.10 = 100 * prof12 / empclf12,
p.unemp.10 = 100 * unemp12 / clf12,
pov.rate.10 = 100 * npov12 / dpov12 )
d3 <-
  d3 %>%
  group_by( cbsaname ) %>%
mutate( metro.mhv.pct.00 = ntile( mhv.00, 100 ),
metro.mhv.pct.10 = ntile( mhv.10, 100 ),
metro.median.pay.00 = median( hinc00, na.rm=T ),
metro.median.pay.10 = median( hinc12, na.rm=T ),
metro.race.rank.00 = ntile( (100-p.white.00), 100 ) ) %>%
ungroup() %>%
mutate( metro.mhv.pct.change = metro.mhv.pct.10 - metro.mhv.pct.00,
pay.change = metro.median.pay.10 - metro.median.pay.00,
race.change = p.white.10 - p.white.00,
mhv.change = mhv.10 - mhv.00 )
Descriptive Statistics of Change Variables
d3 <-
  d3 %>%
  select( c( "tractid", "cbsa", "cbsaname",
"mhv.00", "mhv.10", "mhv.change","pct.change",
"p.white.00", "p.black.00", "p.hisp.00", "p.asian.00",
"p.hs.edu.00", "p.col.edu.00", "p.prof.00", "p.unemp.00",
"pov.rate.00", "p.white.10", "p.black.10", "p.hisp.10",
"p.asian.10", "p.hs.edu.10", "p.col.edu.10", "p.prof.10",
"p.unemp.10", "pov.rate.10", "metro.mhv.pct.00",
"metro.mhv.pct.10", "metro.median.pay.00", "metro.median.pay.10",
"metro.mhv.pct.change", "pay.change", "race.change",
"metro.race.rank.00") )
# head( d3 ) %>% pander()
d3 <- data.frame(d3)

stargazer( d3,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
mhv.00 | 11,167 | 105,661 | 154,903 | 187,129 | 224,337 | 1,288,551 |
mhv.10 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
mhv.change | -1,228,651 | 7,101 | 35,981 | 59,372 | 94,140 | 983,765 |
pct.change | -97 | 6 | 25 | 33 | 50 | 6,059 |
p.white.00 | 0 | 47 | 78 | 67 | 91 | 100 |
p.black.00 | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp.00 | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian.00 | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs.edu.00 | 0 | 67 | 72 | 72 | 77 | 100 |
p.col.edu.00 | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof.00 | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp.00 | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate.00 | 0 | 4 | 9 | 12 | 17 | 100 |
p.white.10 | 0 | 37 | 70 | 60 | 87 | 100 |
p.black.10 | 0 | 2 | 5 | 15 | 17 | 100 |
p.hisp.10 | 0 | 3 | 7 | 17 | 21 | 100 |
p.asian.10 | 0 | 1 | 3 | 6 | 6 | 96 |
p.hs.edu.10 | 0 | 66 | 71 | 71 | 77 | 100 |
p.col.edu.10 | 0 | 15 | 25 | 30 | 41 | 100 |
p.prof.10 | 0 | 24 | 34 | 35 | 46 | 100 |
p.unemp.10 | 0 | 6 | 9 | 10 | 13 | 100 |
pov.rate.10 | 0 | 6 | 12 | 16 | 22 | 100 |
metro.mhv.pct.00 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.mhv.pct.10 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.median.pay.00 | 23,012 | 39,457 | 43,139 | 45,054 | 49,522 | 73,701 |
metro.median.pay.10 | 30,337 | 47,736 | 54,728 | 55,905 | 60,119 | 96,033 |
metro.mhv.pct.change | -99 | -6 | -1 | 0 | 4 | 99 |
pay.change | -3,207 | 7,495 | 9,764 | 10,851 | 13,809 | 27,310 |
race.change | -100 | -10 | -5 | -6 | -1 | 90 |
metro.race.rank.00 | 1 | 20 | 41 | 45 | 68 | 100 |
Operationalizing Gentrification
Which definition did you select for gentrification, and how would you operationalize it?
# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?
# home value in lower than average home in a metro in 2000
poor.2000 <- d3$metro.mhv.pct.00 < 50

# above average diversity for metro
diverse.2000 <- d3$metro.race.rank.00 > 50

# home values increased more than overall city gains
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0

# faster than average growth
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25

# proportion of whites increases by more than 3 percent
# measured by increase in white
loss.diversity <- d3$race.change > 3

g.flag <- poor.2000 & diverse.2000 & mhv.pct.increase & home.val.rise & loss.diversity

num.candidates <- sum( poor.2000 & diverse.2000, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )

num.gentrified
## [1] 1137
num.candidates
## [1] 18219
num.gentrified / num.candidates
## [1] 0.06240738
By this definition, only about 6.2 percent of urban tracts experienced gentrification between 2000 and 2010.
Might the small initial home values be skewing these numbers?
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.00[ mhv.00 < 1000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.891 5.918 25.402 33.167 49.556 6059.261 220
Discussion
What do you think? Is that the right way to operationalize it? Do you care about relative diversity (a neighborhood that is more diverse than the rest of its city) or absolute diversity (the percentage of non-whites, regardless of how diverse the city is)?
Do we care about the absolute increase in value, or the increase relative to the national average? The national average includes all of the gentrifying tracts, which could skew it upward and make it a poor benchmark. Maybe look at the average increase in value for non-gentrification candidates instead?
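As a rough check on that last question, one possible sketch (using the flags created above; candidate is a new name introduced here) compares growth among gentrification candidates and non-candidates:
# does the benchmark look different once candidates are excluded?
candidate <- poor.2000 & diverse.2000

median( d3$pct.change[ candidate ], na.rm=T )    # candidate tracts
median( d3$pct.change[ ! candidate ], na.rm=T )  # non-candidates (alternative benchmark)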
Spatial Visualization
Dust off your GIS skills from CPP 529 so that you can visualize some of your metrics.
You will need to pick one or more cities to use as examples in your report. It is strongly recommended that you create a Dorling cartogram for reporting, since Census tracts introduce visual bias by over-emphasizing low-density tracts and hiding much of the data in the areas where the greatest number of people in the city reside. Recall that Dorling cartograms correct for this by re-sizing administrative units in proportion to the population they represent.
The lab from CPP 529 that covers creating Dorling cartograms is located here:
https://ds4ps.org/cpp-529-master/labs/lab-04-instructions.html
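For reference, the core step from that lab looks roughly like the sketch below. The object tracts.sp and its POP column are placeholder names, not objects created in this tutorial; this tutorial loads a pre-built Dorling file for Phoenix instead.
# rough sketch of building a Dorling cartogram from a tract shapefile
library( sp )
library( cartogram )

tracts.sp <- spTransform( tracts.sp, CRS("+init=epsg:3395") )            # project first
tracts.dorling <- cartogram_dorling( x=tracts.sp, weight="POP", k=0.05 )
plot( tracts.dorling )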
# devtools::install_github( "sjewo/cartogram" )
# install.packages( "tmap" )
library( geojsonio ) # read geoJSON map files from GitHub
library( sp ) # spatial data class sp for shapefiles
library( cartogram ) # spatial maps w/ tract size bias reduction
library( tmap ) # thematic maps
library( maptools ) # spatial object manipulation
library( sf ) # 'simple features' flavor of shapefiles
# we have phoenix already packaged and on GitHub for easy load:
<- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/phx_dorling.geojson"
github.url <- geojson_read( x=github.url, what="sp" )
phx plot( phx )
# create small dataframe for the merge
df <- data.frame( tractid=d$tractid,
                  mhv.00, mhv.10, mhv.change, pct.change )

# create GEOID that matches GIS format
# create a geoID for merging by tract
df$GEOID <- substr( df$tractid, 6, 18 )  # extract codes
df$GEOID <- gsub( "-", "", df$GEOID )    # remove hyphens
class( df$GEOID )
## [1] "character"
head( df$GEOID )
## [1] "01001020100" "01001020200" "01001020300" "01001020400" "01001020500"
## [6] "01001020600"
Note the current class of the PHX shapefile IDs:
- GEOID2 is numeric, has leading zero dropped
- GEOID is a factor that retains leading zeros
You can either convert the data frame ID to numeric (dropping the leading zero) and merge on GEOID2, or keep it as a character vector and merge on GEOID.
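If you prefer the numeric route, a minimal sketch looks like this (the steps below keep the character GEOID instead):
# alternative: drop the leading zero and merge on the numeric ID
df$GEOID2 <- as.numeric( df$GEOID )
# phx <- merge( phx, df, by.x="GEOID2", by.y="GEOID2" )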
head( phx@data ) # sp class from GIS file, so data frame is located @data
# merge census data with dorling map
nrow( phx ) # check dimensions
## [1] 913
<- merge( phx, df, by.x="GEOID", by.y="GEOID" )
phx
# make sure they have not changed or
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( phx )
## [1] 913
phx <- spTransform( phx, CRS("+init=epsg:3395") )

bb <- st_bbox( c( xmin = -12519146, xmax = -12421368,
                  ymax = 3965924, ymin = 3899074 ),
               crs = st_crs("+init=epsg:3395"))
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )