Introduction
This tutorial walks through some common pre-processing steps that you will need for this week’s lab.
Please see the “Project Data Steps” tutorial first to generate your data.
library( dplyr )
library( knitr )
library( pander )
library( stargazer )
library( scales )
library( here )
# set stargazer type to text for
# previewing in RMD docs but
# convert to type HTML when knitting
# (next code chunk)
s.type <- "text"
###################################
#
# STARGAZER SETTINGS
#
###################################
# DO NOT RUN CHUNK UNLESS KNITTING:
# changes table formats to html
# before rendering RMD docs
s.type <- "html"
For this part of the project you will:
- Calculate change in the MHV variable between 2000 and 2010.
- Measure gentrification that occurs between 2000 and 2010.
- Pick a city and create a new Dorling cartogram to visualize your data.
- Prepare descriptive statistics and choropleth maps describing the MHV change variable and gentrification.
This tutorial demonstrates some of the initial steps of variable creation and data exploration for the project.
Data
Before we get started, please ensure that you have completed Part 1 of this tutorial, the Data Steps Tutorial.
The following chunks depend on the files in the data/rodeo
folder that are generated by following the first tutorial.
# note: please do not use static file paths
# note: use here::here() instead, as shown below
d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2010.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )
# check to make sure we are not losing
# or gaining observations in the merge
nrow( d1 )
## [1] 72693
d1 <- select( d1, - year )
d2 <- select( d2, - year )
d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )
nrow( d )
## [1] 72693
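As an additional sanity check, you can confirm that tract IDs remain unique after the merge (a quick sketch):
# each tract should appear exactly once; expect 0 duplicates
sum( duplicated( d$tractid ) )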
Filter Rural Districts
table( d$urban )
##
## rural urban
## 12971 59722
d <- filter( d, urban == "urban" )
Identify Common Variables
We can create a function to compare variables from the 2000 and 2010 datasets:
# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
  # use regular expressions to remove numeric suffixes
  var.names.1 <- names( df1 )
  var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
  var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )

  var.names.2 <- names( df2 )
  var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
  var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )

  shared <- intersect( var.names.1, var.names.2 ) %>% sort()
  print( "SHARED VARIABLES:" )
  print( shared )

  not.shared <- c( setdiff( var.names.1, var.names.2 ),
                   setdiff( var.names.2, var.names.1 ) ) %>% sort()
  print( "NOT SHARED:" )
  print( not.shared )

  d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
  d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
  dd <- rbind( d.vars1, d.vars2 )

  return( dd )
}
vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
## [1] "a15asn" "a15blk" "a15hsp" "a15ntv" "a15wht" "a18und" "a60asn"
## [8] "a60blk" "a60hsp" "a60ntv" "a60up" "a60wht" "a75up" "ag15up"
## [15] "ag18cv" "ag25up" "ag5up" "ageasn" "ageblk" "agehsp" "agentv"
## [22] "agewht" "asian" "china" "clf" "col" "cuban" "dapov"
## [29] "dbpov" "dflabf" "dfmpov" "dhpov" "dmulti" "dnapov" "dpov"
## [36] "dwpov" "empclf" "family" "fb" "fhh" "filip" "flabf"
## [43] "geanc" "gefb" "h10yrs" "h30old" "haw" "hh" "hha"
## [50] "hhb" "hhh" "hhw" "hinc" "hinca" "hincb" "hinch"
## [57] "hincw" "hisp" "hs" "hu" "incpc" "india" "iranc"
## [64] "irfb" "itanc" "itfb" "japan" "korea" "lep" "manuf"
## [71] "mar" "mex" "mhmval" "mrent" "multi" "n10imm" "n65pov"
## [78] "napov" "nat" "nbpov" "nfmpov" "nhblk" "nhpov" "nhwht"
## [85] "nnapov" "npov" "ntv" "nwpov" "ohu" "olang" "own"
## [92] "pop" "pr" "prof" "rent" "ruanc" "rufb" "scanc"
## [99] "scfb" "semp" "tractid" "unemp" "vac" "vet" "viet"
## [106] "wds"
## [1] "NOT SHARED:"
## [1] "a65asn" "a65blk" "a65hsp" "a65ntv" "a65wht" "cni16u" "dis"
## [8] "hu00sp" "ohu00sp" "p10imm" "p10yrs" "p15asn" "p15blk" "p15hsp"
## [15] "p15ntv" "p15wht" "p18und" "p30old" "p60up" "p65asn" "p65blk"
## [22] "p65hsp" "p65ntv" "p65pov" "p65wht" "p75up" "papov" "pasian"
## [29] "pbpov" "pchina" "pcol" "pcuban" "pfb" "pfhh" "pfilip"
## [36] "pflabf" "pfmpov" "pgeanc" "pgefb" "phaw" "phisp" "phpov"
## [43] "phs" "pindia" "piranc" "pirfb" "pitanc" "pitfb" "pjapan"
## [50] "pkorea" "plep" "pmanuf" "pmar" "pmex" "pmulti" "pnapov"
## [57] "pnat" "pnhblk" "pnhwht" "pntv" "polang" "pown" "ppov"
## [64] "ppr" "pprof" "pruanc" "prufb" "pscanc" "pscfb" "psemp"
## [71] "punemp" "pvac" "pvet" "pviet" "pwds" "pwpov"
head( vars )
Create Dataset for Analysis
Create subset for the analysis
d.full <- d  # keep a full copy so we don't have to reload the data
d <- d.full  # restore from the full copy if you need to reset anything
d <- select( d, tractid, mhmval00, mhmval12, hinc00,
             hu00, own00, rent00,
             empclf00, clf00, unemp00, prof00,
             dpov00, npov00,
             ag25up00, hs00, col00,
             pop00.x, nhwht00, nhblk00, hisp00, asian00,
             cbsa, cbsaname )
d <-
  d %>%
  mutate( p.white = 100 * nhwht00 / pop00.x,
          p.black = 100 * nhblk00 / pop00.x,
          p.hisp = 100 * hisp00 / pop00.x,
          p.asian = 100 * asian00 / pop00.x,
          p.hs = 100 * (hs00+col00) / ag25up00,
          p.col = 100 * col00 / ag25up00,
          p.prof = 100 * prof00 / empclf00,
          p.unemp = 100 * unemp00 / clf00,
          pov.rate = 100 * npov00 / dpov00 )
If you call the stargazer() function with a linear model object (a regression model) it will create a nicely-formatted regression table for you.
If you instead use a data frame object it will create a table of descriptive statistics.
You can set the statistics you would like reported in the table with the summary.stat= argument.
stargazer( d,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
---|---|---|---|---|---|---|
mhmval00 | 0 | 81,600 | 119,900 | 144,738 | 173,894 | 1,000,001 |
mhmval12 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
hinc00 | 2,499 | 33,000 | 43,825 | 47,657 | 58,036 | 200,001 |
hu00 | 0 | 1,102 | 1,519 | 1,570 | 1,999 | 11,522 |
own00 | 0 | 542 | 902 | 939 | 1,289 | 4,911 |
rent00 | 0 | 195 | 398 | 516 | 712 | 8,544 |
empclf00 | 0 | 1,205 | 1,756 | 1,820 | 2,373 | 10,334 |
clf00 | 0 | 1,302 | 1,865 | 1,930 | 2,502 | 11,251 |
unemp00 | 0 | 51 | 87 | 110 | 140 | 6,405 |
prof00 | 0 | 299 | 539 | 637 | 873 | 6,610 |
dpov00 | 0 | 2,671 | 3,718 | 3,804 | 4,871 | 23,892 |
npov00 | 0 | 149 | 304 | 452 | 601 | 5,515 |
ag25up00 | 0 | 1,763 | 2,451 | 2,520 | 3,224 | 17,974 |
hs00 | 0 | 665 | 1,071 | 1,155 | 1,552 | 8,909 |
col00 | 0 | 243 | 492 | 665 | 923 | 9,313 |
pop00.x | 0 | 2,751 | 3,802 | 3,901 | 4,976 | 36,206 |
nhwht00 | 0 | 1,308 | 2,514 | 2,591 | 3,713 | 20,619 |
nhblk00 | 0 | 41 | 141 | 522 | 527 | 14,039 |
hisp00 | 0 | 55 | 153 | 547 | 533 | 13,391 |
asian00 | 0 | 22 | 65 | 189 | 183 | 9,491 |
p.white | 0 | 47 | 78 | 67 | 91 | 100 |
p.black | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs | 0 | 67 | 72 | 72 | 77 | 100 |
p.col | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate | 0 | 4 | 9 | 12 | 17 | 100 |
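For completeness, here is a minimal sketch of the regression-table case described above (the model is hypothetical, chosen only to illustrate the output type):
# hypothetical model: 2010 home value as a function of 2000 income
m <- lm( mhmval12 ~ hinc00, data=d )
stargazer( m, type=s.type )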
Exploration of Median Home Value
Initial conditions in 2000:
# adjust 2000 home values for inflation
mhv.00 <- d$mhmval00 * 1.28855
mhv.10 <- d$mhmval12
mhv.change <- mhv.10 - mhv.00
df <- data.frame( MedianHomeValue2000=mhv.00,
MedianHomeValue2010=mhv.10,
Change.00.to.10=mhv.change )
stargazer( df,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
---|---|---|---|---|---|---|
MedianHomeValue2000 | 0 | 105,146 | 154,497 | 186,502 | 224,071 | 1,288,551 |
MedianHomeValue2010 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
Change.00.to.10 | -1,228,651 | 7,187 | 36,268 | 60,047 | 94,881 | 1,000,001 |
A note on inflation calculations: if you google “inflation calculator” you will find a bunch of sites that give you the conversion. Just select 2010 as the final year, enter your starting year, and use $1 as the starting value. For example:
https://westegg.com/inflation/
You will get slightly different results from different sites.
Otherwise, you can always estimate it if you know the average long-term inflation rate. Since 2000 the rate has averaged about 2.5% in the US, so you can use:
# 10 year inflation factor
(1.025)^10
## [1] 1.280085
Prior to 2000 the rates were higher, and they fluctuated quite a bit through the 70s and 80s, so if you are going further back in time it is better to use a calculator. The calculators should use the actual rates for each year.
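If you prefer to compute the factor directly, a small helper generalizes the calculation (a sketch; the function name is our own):
# inflation factor implied by a constant average annual rate
inflation.factor <- function( rate, years ) { (1 + rate)^years }
inflation.factor( rate=0.025, years=10 )  # same as (1.025)^10, about 1.28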
Histogram of MHV Change
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousands of US Dollars (adjusted to 2010)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 2000 to 2010",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ),
col="darkorange", cex=1.8, pos=3 )
median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ),
col="dodgerblue", cex=1.8, pos=3 )
# function to control plot() formatting
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{
  plot( x1, x2,
        pch=19,
        col=gray(0.6, alpha = 0.2),
        cex=2.5,
        bty = "n",
        xlab=lab1,
        ylab=lab2, cex.lab=1.5,
        ... )

  if( draw.line==T ){
    ok <- is.finite(x1) & is.finite(x2)
    lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }
}
Compare 2000 to 2010 distributions.
layout.matrix <- matrix( c( 1,3,
                            2,3 ),
                         nrow=2, ncol=2, byrow=T )

layout( mat = layout.matrix,
        heights = c(2,2),  # Heights of the two rows
        widths = c(3,4) )  # Widths of the two columns
# layout.show(3)
par( mar=c(4,0,0,2) )
hist( mhv.00/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
abline( v=seq(0,1000,100), lty=2, col="gray80" )
text( 550, 4000, labels="Median Home \nValue in 2000",
col="darkslateblue", cex=1.8 )
hist( mhv.10/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )
text( 550, 3500, labels="Median Home \nValue in 2010",
col="darkslateblue", cex=1.8 )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
# data reduction - sample 1,000 observations
df <- data.frame( v00=mhv.00/1000, v10=mhv.10/1000 )
df <- sample_n( df, 1000 )
par( mar=c(4,5,3,2) )
jplot( df$v00, df$v10,
lab1="MHV in 2000", lab2="MHV in 2010",
xlim=c(0,1000), ylim=c(0,1000),
axes=F )
abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
Change in MHV 2000-2010
If a home worth $10 million increased in value by $100k over ten years, it would not be that surprising. If a home worth $50k increased by $100k over the same period, that is growth of 200 percent and is notable.
The change-in-value variable reports only absolute change and gives no sense of whether that is a big or small amount for the census tract.
The percent change variable provides that context by measuring growth relative to each tract’s starting value.
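To make the comparison concrete:
# $100k gain on a $10M home: 1% growth
100000 / 10000000   # 0.01
# $100k gain on a $50k home: 200% growth
100000 / 50000      # 2.0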
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
# interpretation is hard if there were few homes in 2000,
# resulting in an artificially low MHV. We don't trust cases
# that jump from homes worth $10k to regular values
# because that is more likely data error or noise
# than meaningful variance
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- mhv.change / mhv.00
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.96891 0.05918 0.25402 0.33167 0.49556 60.59261 220
# how many cases had increases above 500%
sum( pct.change > 5, na.rm=T )
## [1] 112
# preview tracts with large increases in home values
# to see if increases make sense
d %>%
filter( pct.change > 5 ) %>%
head()
Plot the percent change variable:
hg <-
hist( pct.change, breaks=2000,
xlim=c(-1,2), yaxt="n", xaxt="n",
xlab="", cex.main=1.5,
ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
col="gray40", border="white" )
axis( side=1, at=seq( from=-1, to=2, by=0.5 ),
labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
ymax <- max( hg$count )
mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax),
labels=paste0( "Mean = ", round(100*mean.x,0), "%"),
col="darkorange", cex=1.8, pos=4 )
median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax),
labels=paste0( "Median = ", round(100*median.x,0), "%"),
col="dodgerblue", cex=1.8, pos=4 )
Group Growth Rates By Metro Area
We often want to disaggregate descriptive statistics by some grouping in the data, such as metro areas.
dplyr makes this easy: group the data, then summarize it.
d$mhv.change <- mhv.change
d$pct.change <- pct.change
d$mhv.10 <- mhv.10
d$mhv.00 <- mhv.00
d %>%
  group_by( cbsaname ) %>%
  summarize( ave.change = median( mhv.change, na.rm=T ),
             ave.change.d = dollar( round(ave.change,0) ),
             growth = 100 * median( pct.change, na.rm=T ) ) %>%
  ungroup() %>%
  arrange( - growth ) %>%
  select( - ave.change ) %>%
  head( 25 ) %>%
  pander()
cbsaname | ave.change.d | growth |
---|---|---|
Ocean City, NJ | $154,667 | 93.07 |
Virginia Beach-Norfolk-Newport News, VA | $97,955 | 71.81 |
Casper, WY | $70,770 | 70.03 |
New York-Wayne-White Plains, NY-NJ | $189,118 | 69.86 |
Kingston, NY | $89,272 | 65.08 |
Barnstable Town, MA | $146,088 | 65.07 |
Washington-Arlington-Alexandria DC-VA | $139,136 | 64.82 |
Charlottesville, VA | $104,487 | 62.74 |
Atlantic City, NJ | $94,239 | 62.32 |
Baltimore-Towson, MD | $102,918 | 61.9 |
Bethesda-Frederick-Gaithersburg, MD | $146,639 | 61.46 |
San Luis Obispo-Paso Robles, CA | $172,722 | 61.01 |
Midland, TX | $54,576 | 60.3 |
Los Angeles-Long Beach-Santa Ana, CA | $145,968 | 60.3 |
Redding, CA | $87,677 | 59.12 |
Honolulu, HI | $194,665 | 58.85 |
Chico, CA | $81,746 | 57.83 |
Nassau-Suffolk, NY | $149,785 | 57.62 |
Edison, NJ | $125,056 | 57.19 |
Santa Ana-Anaheim-Irvine, CA | $177,367 | 56.05 |
Poughkeepsie-Newburgh-Middletown, NY | $101,010 | 55.18 |
Flagstaff, AZ | $71,072 | 54.67 |
Salisbury, MD | $60,400 | 54.06 |
Winchester, VA-WV | $73,165 | 53.76 |
Odessa, TX | $26,240 | 53.37 |
Measuring Gentrification
We saved the original merged dataset as d.full, so we do not need to reload it.
Recall our data steps thus far:
# adjust 2000 home values for inflation
mhv.00 <- d.full$mhmval00 * 1.28855
mhv.10 <- d.full$mhmval12
mhv.change <- mhv.10 - mhv.00
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.891 5.918 25.402 33.167 49.556 6059.261 220
d.full$mhv.00 <- mhv.00
d.full$mhv.10 <- mhv.10
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.change
Select Gentrification Variables
Select variables for operationalizing a definition of gentrification. We need to add some variables from 2010.
Recall that we created a list of variables shared across 2000 and 2010 for reference:
head( vars )
d3 <- select( d.full,
              tractid, cbsa, cbsaname,                      # ids / units of analysis
              mhv.00, mhv.10, mhv.change, pct.change,       # home value
              hinc00, hu00, own00, rent00,                  # ses
              hinc12, hu10, own10, rent10,
              empclf00, clf00, unemp00, prof00,             # employment
              empclf12, clf12, unemp12, prof12,
              dpov00, npov00,                               # poverty
              dpov12, npov12,
              ag25up00, hs00, col00,                        # education
              ag25up12, hs12, col12,
              pop00.x, nhwht00, nhblk00, hisp00, asian00,   # race
              pop10, nhwht10, nhblk10, hisp10, asian10
            ) # end select
d3 <-
  d3 %>%
  mutate(
    # 2000 variables
    p.white.00 = 100 * nhwht00 / pop00.x,
    p.black.00 = 100 * nhblk00 / pop00.x,
    p.hisp.00 = 100 * hisp00 / pop00.x,
    p.asian.00 = 100 * asian00 / pop00.x,
    p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
    p.col.edu.00 = 100 * col00 / ag25up00,
    p.prof.00 = 100 * prof00 / empclf00,
    p.unemp.00 = 100 * unemp00 / clf00,
    pov.rate.00 = 100 * npov00 / dpov00,

    # 2010 variables
    p.white.10 = 100 * nhwht10 / pop10,
    p.black.10 = 100 * nhblk10 / pop10,
    p.hisp.10 = 100 * hisp10 / pop10,
    p.asian.10 = 100 * asian10 / pop10,
    p.hs.edu.10 = 100 * (hs12+col12) / ag25up12,
    p.col.edu.10 = 100 * col12 / ag25up12,
    p.prof.10 = 100 * prof12 / empclf12,
    p.unemp.10 = 100 * unemp12 / clf12,
    pov.rate.10 = 100 * npov12 / dpov12 )
d3 <-
  d3 %>%
  group_by( cbsaname ) %>%
  mutate( metro.mhv.pct.00 = ntile( mhv.00, 100 ),
          metro.mhv.pct.10 = ntile( mhv.10, 100 ),
          metro.median.pay.00 = median( hinc00, na.rm=T ),
          metro.median.pay.10 = median( hinc12, na.rm=T ),
          metro.race.rank.00 = ntile( (100-p.white.00), 100 ) ) %>%
  ungroup() %>%
  mutate( metro.mhv.pct.change = metro.mhv.pct.10 - metro.mhv.pct.00,
          pay.change = metro.median.pay.10 - metro.median.pay.00,
          race.change = p.white.10 - p.white.00,
          mhv.change = mhv.10 - mhv.00 )
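The ntile() calls above assign each tract a percentile rank from 1 to 100 within its metro. A minimal illustration of how ntile() bins a vector:
# ntile() splits values into n equal-sized ranked groups
ntile( c( 10, 40, 20, 30 ), 2 )  # returns 1 2 1 2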
Descriptive Statistics of Change Variables
d3 <-
  d3 %>%
  select( c( "tractid", "cbsa", "cbsaname",
             "mhv.00", "mhv.10", "mhv.change", "pct.change",
             "p.white.00", "p.black.00", "p.hisp.00", "p.asian.00",
             "p.hs.edu.00", "p.col.edu.00", "p.prof.00", "p.unemp.00",
             "pov.rate.00", "p.white.10", "p.black.10", "p.hisp.10",
             "p.asian.10", "p.hs.edu.10", "p.col.edu.10", "p.prof.10",
             "p.unemp.10", "pov.rate.10", "metro.mhv.pct.00",
             "metro.mhv.pct.10", "metro.median.pay.00", "metro.median.pay.10",
             "metro.mhv.pct.change", "pay.change", "race.change",
             "metro.race.rank.00" ) )
# head( d3 ) %>% pander()
d3 <- data.frame(d3)
stargazer( d3,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
---|---|---|---|---|---|---|
mhv.00 | 11,167 | 105,661 | 154,903 | 187,129 | 224,337 | 1,288,551 |
mhv.10 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
mhv.change | -1,228,651 | 7,101 | 35,981 | 59,372 | 94,140 | 983,765 |
pct.change | -97 | 6 | 25 | 33 | 50 | 6,059 |
p.white.00 | 0 | 47 | 78 | 67 | 91 | 100 |
p.black.00 | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp.00 | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian.00 | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs.edu.00 | 0 | 67 | 72 | 72 | 77 | 100 |
p.col.edu.00 | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof.00 | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp.00 | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate.00 | 0 | 4 | 9 | 12 | 17 | 100 |
p.white.10 | 0 | 37 | 70 | 60 | 87 | 100 |
p.black.10 | 0 | 2 | 5 | 15 | 17 | 100 |
p.hisp.10 | 0 | 3 | 7 | 17 | 21 | 100 |
p.asian.10 | 0 | 1 | 3 | 6 | 6 | 96 |
p.hs.edu.10 | 0 | 66 | 71 | 71 | 77 | 100 |
p.col.edu.10 | 0 | 15 | 25 | 30 | 41 | 100 |
p.prof.10 | 0 | 24 | 34 | 35 | 46 | 100 |
p.unemp.10 | 0 | 6 | 9 | 10 | 13 | 100 |
pov.rate.10 | 0 | 6 | 12 | 16 | 22 | 100 |
metro.mhv.pct.00 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.mhv.pct.10 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.median.pay.00 | 23,012 | 39,457 | 43,139 | 45,054 | 49,522 | 73,701 |
metro.median.pay.10 | 30,337 | 47,736 | 54,728 | 55,905 | 60,119 | 96,033 |
metro.mhv.pct.change | -99 | -6 | -1 | 0 | 4 | 99 |
pay.change | -3,207 | 7,495 | 9,764 | 10,851 | 13,809 | 27,310 |
race.change | -100 | -10 | -5 | -6 | -1 | 90 |
metro.race.rank.00 | 1 | 20 | 41 | 45 | 68 | 100 |
Operationalizing Gentrification
Which definition did you select for gentrification, and how would you operationalize it?
# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?
# home value lower than the average home in the metro in 2000
poor.2000 <- d3$metro.mhv.pct.00 < 50
# above average diversity for metro
diverse.2000 <- d3$metro.race.rank.00 > 50
# home values increased more than overall city gains
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0
# faster than average growth
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25
# proportion of white residents increases by more than 3 percentage points
# (measured as the change in percent white)
loss.diversity <- d3$race.change > 3
g.flag <- poor.2000 & diverse.2000 & mhv.pct.increase & home.val.rise & loss.diversity
num.candidates <- sum( poor.2000 & diverse.2000, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )
num.gentrified
## [1] 1137
num.candidates
## [1] 18219
num.gentrified / num.candidates
## [1] 0.06240738
By this definition, only about 6.2 percent of urban tracts experienced gentrification between 2000 and 2010.
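A cross-tabulation of the two flags shows the same counts at a glance (a quick sketch):
# candidates (poor + diverse in 2000) vs. gentrification outcome
table( candidate=poor.2000 & diverse.2000,
       gentrified=g.flag, useNA="ifany" )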
Might small initial values still be skewing these numbers?
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.891 5.918 25.402 33.167 49.556 6059.261 220
Discussion
What do you think? Is that the right way to operationalize gentrification? Do you care about relative diversity (a neighborhood that is more diverse than the rest of its city) or absolute diversity (the percentage of non-white residents regardless of city-level diversity)?
Do we care about an absolute increase in value, or an increase relative to the national average? The national average includes all of the gentrifying tracts, which could skew it upward and make it a poor benchmark. Maybe look at the average increase in value for non-gentrification candidates instead?
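One quick way to construct that benchmark is to compare median growth for candidates and non-candidates (a sketch using the flags defined above):
# median growth rate by candidate status (FALSE = non-candidates)
candidate <- poor.2000 & diverse.2000
tapply( d3$pct.change, candidate, median, na.rm=T )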
Spatial Visualization
Dust off your GIS skills from CPP 529 so that you can visualize some of your metrics.
You will need to pick one or more cities to use as examples in your report. It is strongly recommended that you create a Dorling cartogram for reporting, since Census tract maps introduce visual bias by over-emphasizing low-density tracts and hiding much of the data in the dense areas where most of a city’s residents live. Recall that Dorling cartograms correct for this by re-sizing administrative units in proportion to the population they represent.
The lab from CPP 529 that covers creating Dorling cartograms is located here:
https://ds4ps.org/cpp-529-master/labs/lab-04-instructions.html
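That lab covers the full workflow; as a quick refresher, here is a minimal sketch using cartogram::cartogram_dorling(), assuming a projected sp object named my.shp with a population column named pop (both hypothetical):
library( cartogram )   # cartogram_dorling()
library( sp )
# Dorling cartograms require projected coordinates, not lat/long
my.shp <- spTransform( my.shp, CRS("+init=epsg:3395") )
# circle sizes are proportional to the weight variable (population here)
my.dorling <- cartogram_dorling( my.shp, weight="pop" )
plot( my.dorling )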
# devtools::install_github( "sjewo/cartogram" )
# install.packages( "tmap" )
library( geojsonio ) # read geoJSON map files from GitHub
library( sp ) # spatial data class sp for shapefiles
library( cartogram ) # spatial maps w/ tract size bias reduction
library( tmap ) # thematic maps
library( maptools ) # spatial object manipulation
library( sf ) # 'simple features' flavor of shapefiles
# we have Phoenix already packaged and on GitHub for easy loading:
github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/phx_dorling.geojson"
phx <- geojson_read( x=github.url, what="sp" )
plot( phx )
# create small dataframe for the merge
df <- data.frame( tractid=d$tractid,
mhv.00, mhv.10, mhv.change, pct.change )
# create a GEOID that matches the GIS format
# so we can merge by tract
df$GEOID <- substr( df$tractid, 6, 18 ) # extract codes
df$GEOID <- gsub( "-", "", df$GEOID ) # remove hyphens
class( df$GEOID )
## [1] "character"
head( df$GEOID )
## [1] "01001020100" "01001020200" "01001020300" "01001020400" "01001020500"
## [6] "01001020600"
Note the current class of the PHX shapefile IDs:
- GEOID2 is numeric, has leading zero dropped
- GEOID is a factor that retains leading zeros
You can either convert the data frame ID to numeric, which drops the leading zero, and merge on GEOID2, or keep it as a character vector and merge on GEOID.
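The numeric route would look something like this (a sketch; we take the character route below):
# coercing the ID to numeric drops the leading zero
df$GEOID2 <- as.numeric( df$GEOID )
# phx <- merge( phx, df, by.x="GEOID2", by.y="GEOID2" )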
head( phx@data ) # sp class from GIS file, so data frame is located @data
# merge census data with dorling map
nrow( phx ) # check dimensions
## [1] 913
phx <- merge( phx, df, by.x="GEOID", by.y="GEOID" )
# make sure the row count has not changed;
# otherwise you are missing rows in your data frame
# or merging with the wrong ID
nrow( phx )
## [1] 913
phx <- spTransform( phx, CRS("+init=epsg:3395") )
bb <- st_bbox( c( xmin = -12519146, xmax = -12421368,
ymax = 3965924, ymin = 3899074 ),
crs = st_crs("+init=epsg:3395"))
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )