Introduction
This is a tutorial that shows you some common preprocessing steps you will need to do in this week’s lab.
Please use the import::here() function
This tutorial does not use import::here(), but your submission for week 03 requires it. The code is kept here to help you understand why certain decisions were made.
If you need to see how to use the import::here() function, please see the "Project Data Steps" tutorial.
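For reference, a minimal sketch of what an import::here() call could look like in your week 03 submission is shown below. The helper names clean_d and build_year are hypothetical placeholders; substitute the functions your utilities.R actually defines.
# a sketch only - "clean_d" and "build_year" are hypothetical helper names
import::here( "clean_d",
              "build_year",
              .from = here::here( "labs/wk03/utilities.R" ),
              .character_only = TRUE )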
library( dplyr )
library( here )
library( knitr )
library( pander )
library( stargazer )
library( scales )
# set stargazer type to text for
# previewing in RMD docs but
# convert to type HTML when knitting
# (next code chunk)
<- "text" s.type
###################################
#
# STARGAZER SETTINGS
#
###################################
# DO NOT RUN CHUNK UNLESS KNITTING:
# changes table formats to html
# before rendering RMD docs
<- "html" s.type
For this part of the project you will:
- Calculate change in the MHV variable between 2000 and 2010.
- Measure gentrification that occurs between 2000 and 2010.
- Pick a city and create a new Dorling cartogram to visualize your data.
- Prepare descriptive statistics and choropleth maps describing the MHV change variable and gentrification.
This tutorial demonstrates some of the initial steps of variable creation and data exploration for the project.
Data
See the data steps for the wrangling that occurs during the process of creating our rodeo datasets.
Reminder: stop using static file paths and start using the here::here() function
I cannot stress this enough: all data files must be imported using relative paths. You cannot use a static file path (e.g. /Users/myname/some/file/path) because it breaks portability and therefore makes it impossible for anyone to reproduce your analysis: no one else can re-run your file, because that static file path exists on your machine and yours alone. For a refresher on why you cannot use static file paths and need to use the here::here() function, please see Week 02's lecture videos.
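As a quick illustration, using one of the rodeo files loaded later in this tutorial:
# fragile: a static path that exists only on one machine
# d1 <- readRDS( "/Users/myname/some/file/path/LTDB-2000.rds" )

# portable: a relative path resolved from the project root
d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )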
Before moving on, be sure to run the Data Steps tutorial
The following chunks depend on you having clean data in your data/rodeo folder. These files are generated by following along with the Data Steps tutorial.
To re-run the tutorial locally, follow these steps:
- Download a local copy of the labs/utilities.R file.
- Save the file as labs/wk03/utilities.R.
- Download a local copy of the labs/project_data_steps.R file.
- Save the file as labs/wk03/project_data_steps.R.
- Select all the lines within labs/wk03/project_data_steps.R and run them to produce all of the clean data files within data/rodeo (or source the file as shown in the sketch after this list).
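Alternatively, assuming the script runs top to bottom without manual input, you can source the whole file in one call:
# equivalent to selecting and running every line of the script
source( here::here( "labs/wk03/project_data_steps.R" ) )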
# note: please do not use static file paths
# note: notice down below the use of here::here()
<- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
d1 <- readRDS( here::here( "data/rodeo/LTDB-2010.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )
md
# check to make sure we are not losing
# or gaining observations in the merge
nrow( d1 )
## [1] 72693
d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )
nrow( d )
## [1] 72693
Filter Rural Districts
table( d$urban )
##
## rural urban
## 12971 59722
d <- filter( d, urban == "urban" )
Identify Common Variables
We can create a function to compare variables from the 2000 and 2010 datasets:
# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
  # use regular expressions to remove numeric suffixes
  var.names.1 <- names( df1 )
  var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
  var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )

  var.names.2 <- names( df2 )
  var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
  var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )

  shared <- intersect( var.names.1, var.names.2 ) %>% sort()
  print( "SHARED VARIABLES:" )
  print( shared )

  not.shared <- c( setdiff( var.names.1, var.names.2 ),
                   setdiff( var.names.2, var.names.1 ) ) %>% sort()

  print( "NOT SHARED:" )
  print( not.shared )

  d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
  d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
  dd <- rbind( d.vars1, d.vars2 )

  return( dd )
}

vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
## [1] "a15asn" "a15blk" "a15hsp" "a15ntv" "a15wht" "a18und" "a60asn"
## [8] "a60blk" "a60hsp" "a60ntv" "a60up" "a60wht" "a75up" "ag15up"
## [15] "ag18cv" "ag25up" "ag5up" "ageasn" "ageblk" "agehsp" "agentv"
## [22] "agewht" "asian" "china" "clf" "col" "cuban" "dapov"
## [29] "dbpov" "dflabf" "dfmpov" "dhpov" "dmulti" "dnapov" "dpov"
## [36] "dwpov" "empclf" "family" "fb" "fhh" "filip" "flabf"
## [43] "geanc" "gefb" "h10yrs" "h30old" "haw" "hh" "hha"
## [50] "hhb" "hhh" "hhw" "hinc" "hinca" "hincb" "hinch"
## [57] "hincw" "hisp" "hs" "hu" "incpc" "india" "iranc"
## [64] "irfb" "itanc" "itfb" "japan" "korea" "lep" "manuf"
## [71] "mar" "mex" "mhmval" "mrent" "multi" "n10imm" "n65pov"
## [78] "napov" "nat" "nbpov" "nfmpov" "nhblk" "nhpov" "nhwht"
## [85] "nnapov" "npov" "ntv" "nwpov" "ohu" "olang" "own"
## [92] "pop" "pr" "prof" "rent" "ruanc" "rufb" "scanc"
## [99] "scfb" "semp" "tractid" "unemp" "vac" "vet" "viet"
## [106] "wds"
## [1] "NOT SHARED:"
## [1] "a65asn" "a65blk" "a65hsp" "a65ntv" "a65wht" "cni16u" "dis"
## [8] "hu00sp" "ohu00sp" "p10imm" "p10yrs" "p15asn" "p15blk" "p15hsp"
## [15] "p15ntv" "p15wht" "p18und" "p30old" "p60up" "p65asn" "p65blk"
## [22] "p65hsp" "p65ntv" "p65pov" "p65wht" "p75up" "papov" "pasian"
## [29] "pbpov" "pchina" "pcol" "pcuban" "pfb" "pfhh" "pfilip"
## [36] "pflabf" "pfmpov" "pgeanc" "pgefb" "phaw" "phisp" "phpov"
## [43] "phs" "pindia" "piranc" "pirfb" "pitanc" "pitfb" "pjapan"
## [50] "pkorea" "plep" "pmanuf" "pmar" "pmex" "pmulti" "pnapov"
## [57] "pnat" "pnhblk" "pnhwht" "pntv" "polang" "pown" "ppov"
## [64] "ppr" "pprof" "pruanc" "prufb" "pscanc" "pscfb" "psemp"
## [71] "punemp" "pvac" "pvet" "pviet" "pwds" "pwpov"
head( vars )
Create Dataset for Analysis
Create subset for the analysis
d.full <- d  # keep a copy so we don't have to reload

d <- d.full  # store original in case you need to reset anything

d <- select( d, tractid, mhmval00, mhmval12, hinc00,
             hu00, own00, rent00,
             empclf00, clf00, unemp00, prof00,
             dpov00, npov00,
             ag25up00, hs00, col00,
             pop00.x, nhwht00, nhblk00, hisp00, asian00,
             cbsa, cbsaname )

d <-
  d %>%
  mutate( p.white = 100 * nhwht00 / pop00.x,
          p.black = 100 * nhblk00 / pop00.x,
          p.hisp = 100 * hisp00 / pop00.x,
          p.asian = 100 * asian00 / pop00.x,
          p.hs = 100 * (hs00+col00) / ag25up00,
          p.col = 100 * col00 / ag25up00,
          p.prof = 100 * prof00 / empclf00,
          p.unemp = 100 * unemp00 / clf00,
          pov.rate = 100 * npov00 / dpov00 )
If you call the stargazer() function with a linear model object (a regression model) it will create a nicely-formatted regression table for you.
If you instead use a data frame object it will create a table of descriptive statistics.
You can set the statistics you would like reported in the table with the summary.stat= argument.
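For example, a regression table could be produced like this (a hypothetical model for illustration only; the chunk that follows instead passes the data frame d to get descriptive statistics):
# hypothetical model for illustration - not part of the lab
m <- lm( pov.rate ~ p.unemp + p.col, data=d )
stargazer( m, type=s.type, digits=2 )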
stargazer( d,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
mhmval00 | 0 | 81,600 | 119,900 | 144,738 | 173,894 | 1,000,001 |
mhmval12 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
hinc00 | 2,499 | 33,000 | 43,825 | 47,657 | 58,036 | 200,001 |
hu00 | 0 | 1,102 | 1,519 | 1,570 | 1,999 | 11,522 |
own00 | 0 | 542 | 902 | 939 | 1,289 | 4,911 |
rent00 | 0 | 195 | 398 | 516 | 712 | 8,544 |
empclf00 | 0 | 1,205 | 1,756 | 1,820 | 2,373 | 10,334 |
clf00 | 0 | 1,302 | 1,865 | 1,930 | 2,502 | 11,251 |
unemp00 | 0 | 51 | 87 | 110 | 140 | 6,405 |
prof00 | 0 | 299 | 539 | 637 | 873 | 6,610 |
dpov00 | 0 | 2,671 | 3,718 | 3,804 | 4,871 | 23,892 |
npov00 | 0 | 149 | 304 | 452 | 601 | 5,515 |
ag25up00 | 0 | 1,763 | 2,451 | 2,520 | 3,224 | 17,974 |
hs00 | 0 | 665 | 1,071 | 1,155 | 1,552 | 8,909 |
col00 | 0 | 243 | 492 | 665 | 923 | 9,313 |
pop00.x | 0 | 2,751 | 3,802 | 3,901 | 4,976 | 36,206 |
nhwht00 | 0 | 1,308 | 2,514 | 2,591 | 3,713 | 20,619 |
nhblk00 | 0 | 41 | 141 | 522 | 527 | 14,039 |
hisp00 | 0 | 55 | 153 | 547 | 533 | 13,391 |
asian00 | 0 | 22 | 65 | 189 | 183 | 9,491 |
p.white | 0 | 47 | 78 | 67 | 91 | 100 |
p.black | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs | 0 | 67 | 72 | 72 | 77 | 100 |
p.col | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate | 0 | 4 | 9 | 12 | 17 | 100 |
Exploration of Median Home Value
Initial conditions in 2000:
# adjust 2000 home values for inflation
mhv.00 <- d$mhmval00 * 1.28855
mhv.10 <- d$mhmval12

mhv.change <- mhv.10 - mhv.00

df <- data.frame( MedianHomeValue2000=mhv.00,
                  MedianHomeValue2010=mhv.10,
                  Change.00.to.10=mhv.change )
stargazer( df,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
MedianHomeValue2000 | 0 | 105,146 | 154,497 | 186,502 | 224,071 | 1,288,551 |
MedianHomeValue2010 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
Change.00.to.10 | -1,228,651 | 7,187 | 36,268 | 60,047 | 94,881 | 1,000,001 |
A note on inflation calculations. You can google “inflation calculator” and you will find a bunch of sites that give you the conversion. Just select 2010 as the final year, your input year, and use $1 as the starting value. For example:
https://westegg.com/inflation/
You will get slightly different results by site.
Otherwise, you can always estimate it if you know the average long-term inflation rate. Since 2000 the rate has averaged about 2.5% in the US, so you can use:
# 10 year inflation factor
(1.025)^10
## [1] 1.280085
Prior to 2000 the rates were higher, and they fluctuated quite a bit through the 70s and 80s, so if you are going back further in time it is better to use a calculator. The calculators should use the actual rates for each year.
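In other words, a calculator compounds the individual annual rates rather than applying a single average. A sketch of the arithmetic (the rates below are round placeholder values, not actual CPI figures):
# placeholder annual inflation rates - NOT actual CPI values
annual.rates <- c( 0.03, 0.02, 0.02, 0.03, 0.03, 0.03, 0.03, 0.04, 0.00, 0.03 )

# cumulative inflation factor over the ten years
prod( 1 + annual.rates )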
Histogram of MHV
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 2000 to 2010",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
      labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ),
      col="darkorange", cex=1.8, pos=3 )

median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
      labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ),
      col="dodgerblue", cex=1.8, pos=3 )
# function to control plot() formatting
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{

  plot( x1, x2,
        pch=19,
        col=gray(0.6, alpha = 0.2),
        cex=2.5,
        bty = "n",
        xlab=lab1,
        ylab=lab2, cex.lab=1.5,
        ... )

  if( draw.line==T ){
    ok <- is.finite(x1) & is.finite(x2)
    lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }

}
Compare 2000 to 2010 distributions.
layout.matrix <- matrix( c( 1,3,
                            2,3 ),
                         nrow=2, ncol=2, byrow=T )
layout( mat = layout.matrix,
heights = c(2,2), # Heights of the two rows
widths = c(3,4)) # Widths of the two columns
# layout.show(3)
par( mar=c(4,0,0,2) )
hist( mhv.00/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
abline( v=seq(0,1000,100), lty=2, col="gray80" )
text( 550, 4000, labels="Median Home \nValue in 2000",
col="darkslateblue", cex=1.8 )
hist( mhv.10/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )
text( 550, 3500, labels="Median Home \nValue in 2010",
col="darkslateblue", cex=1.8 )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
# data reduction - sample 1,000 observations for plotting
df <- data.frame( v00=mhv.00/1000, v10=mhv.10/1000 )
df <- sample_n( df, 1000 )
par( mar=c(4,5,3,2) )
jplot( df$v00, df$v10,
lab1="MHV in 2000", lab2="MHV in 2010",
xlim=c(0,1000), ylim=c(0,1000),
axes=F )
abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
Change in MHV 2000-2010
If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.
The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.
The percent change variable provides some context for growth rates of value in census tracts.
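A quick worked version of the example above:
# $100k gain on a $10 million home is a 1% increase
100000 / 10000000
## [1] 0.01

# $100k gain on a $50k home is a 200% increase
100000 / 50000
## [1] 2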
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- mhv.change / mhv.00
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.96891 0.05918 0.25402 0.33167 0.49556 60.59261 220
# how many cases had increases above 500%
sum( pct.change > 5, na.rm=T )
## [1] 112
# preview tracts with large increases in home values
# to see if increases make sense
d %>%
  filter( pct.change > 5 ) %>%
  head()
Plot the percent change variable:
hg <-
  hist( pct.change, breaks=2000,
        xlim=c(-1,2), yaxt="n", xaxt="n",
        xlab="", cex.main=1.5,
        ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
        col="gray40", border="white" )

axis( side=1, at=seq( from=-1, to=2, by=0.5 ),
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )

ymax <- max( hg$count )

mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax),
      labels=paste0( "Mean = ", round(100*mean.x,0), "%"),
      col="darkorange", cex=1.8, pos=4 )

median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax),
      labels=paste0( "Median = ", round(100*median.x,0), "%"),
      col="dodgerblue", cex=1.8, pos=4 )
Group Growth Rates By Metro Area
We often want to disaggregate descriptive statistics by some grouping in the data, such as metro areas.
dplyr makes this easy by grouping then summarizing the data.
d$mhv.change <- mhv.change
d$pct.change <- pct.change
d$mhv.10 <- mhv.10
d$mhv.00 <- mhv.00

d %>%
  group_by( cbsaname ) %>%
  summarize( ave.change = median( mhv.change, na.rm=T ),
             ave.change.d = dollar( round(ave.change,0) ),
             growth = 100 * median( pct.change, na.rm=T ) ) %>%
  ungroup() %>%
  arrange( - growth ) %>%
  select( - ave.change ) %>%
  head( 25 ) %>%
  pander()
cbsaname | ave.change.d | growth |
---|---|---|
Ocean City, NJ | $154,667 | 93.07 |
Virginia Beach-Norfolk-Newport News, VA | $97,955 | 71.81 |
Casper, WY | $70,770 | 70.03 |
New York-Wayne-White Plains, NY-NJ | $189,118 | 69.86 |
Kingston, NY | $89,272 | 65.08 |
Barnstable Town, MA | $146,088 | 65.07 |
Washington-Arlington-Alexandria DC-VA | $139,136 | 64.82 |
Charlottesville, VA | $104,487 | 62.74 |
Atlantic City, NJ | $94,239 | 62.32 |
Baltimore-Towson, MD | $102,918 | 61.9 |
Bethesda-Frederick-Gaithersburg, MD | $146,639 | 61.46 |
San Luis Obispo-Paso Robles, CA | $172,722 | 61.01 |
Midland, TX | $54,576 | 60.3 |
Los Angeles-Long Beach-Santa Ana, CA | $145,968 | 60.3 |
Redding, CA | $87,677 | 59.12 |
Honolulu, HI | $194,665 | 58.85 |
Chico, CA | $81,746 | 57.83 |
Nassau-Suffolk, NY | $149,785 | 57.62 |
Edison, NJ | $125,056 | 57.19 |
Santa Ana-Anaheim-Irvine, CA | $177,367 | 56.05 |
Poughkeepsie-Newburgh-Middletown, NY | $101,010 | 55.18 |
Flagstaff, AZ | $71,072 | 54.67 |
Salisbury, MD | $60,400 | 54.06 |
Winchester, VA-WV | $73,165 | 53.76 |
Odessa, TX | $26,240 | 53.37 |
Measuring Gentrification
We saved the original merged dataset as d.full, so we don't need to reload it:
Recall our data steps thus far:
# adjust 2000 home values for inflation
mhv.00 <- d.full$mhmval00 * 1.28855
mhv.10 <- d.full$mhmval12

mhv.change <- mhv.10 - mhv.00
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.891 5.918 25.402 33.167 49.556 6059.261 220
d.full$mhv.00 <- mhv.00
d.full$mhv.10 <- mhv.10
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.change
Select Gentrification Variables
Select variables for operationalizing a definition of gentrification:
We need to add some variables from 2010:
Recall we created a 2000 to 2010 variable list for reference:
head( vars )
d3 <- select( d.full,

              # ids / units of analysis
              tractid, cbsa, cbsaname,

              # home value
              mhv.00, mhv.10, mhv.change, pct.change,

              # ses
              hinc00, hu00, own00, rent00,
              hinc12, hu10, own10, rent10,

              # employment
              empclf00, clf00, unemp00, prof00,
              empclf12, clf12, unemp12, prof12,

              # poverty
              dpov00, npov00,
              dpov12, npov12,

              # education
              ag25up00, hs00, col00,
              ag25up12, hs12, col12,

              # race
              pop00.x, nhwht00, nhblk00, hisp00, asian00,
              pop10, nhwht10, nhblk10, hisp10, asian10

            ) # end select
d3 <-
  d3 %>%
  mutate(
# 2000 variables
p.white.00 = 100 * nhwht00 / pop00.x,
p.black.00 = 100 * nhblk00 / pop00.x,
p.hisp.00 = 100 * hisp00 / pop00.x,
p.asian.00 = 100 * asian00 / pop00.x,
p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
p.col.edu.00 = 100 * col00 / ag25up00,
p.prof.00 = 100 * prof00 / empclf00,
p.unemp.00 = 100 * unemp00 / clf00,
pov.rate.00 = 100 * npov00 / dpov00,
# 2010 variables
p.white.10 = 100 * nhwht10 / pop10,
p.black.10 = 100 * nhblk10 / pop10,
p.hisp.10 = 100 * hisp10 / pop10,
p.asian.10 = 100 * asian10 / pop10,
p.hs.edu.10 = 100 * (hs12+col12) / ag25up12,
p.col.edu.10 = 100 * col12 / ag25up12,
p.prof.10 = 100 * prof12 / empclf12,
p.unemp.10 = 100 * unemp12 / clf12,
pov.rate.10 = 100 * npov12 / dpov12 )
d3 <-
  d3 %>%
  group_by( cbsaname ) %>%
mutate( metro.mhv.pct.00 = ntile( mhv.00, 100 ),
metro.mhv.pct.10 = ntile( mhv.10, 100 ),
metro.median.pay.00 = median( hinc00, na.rm=T ),
metro.median.pay.10 = median( hinc12, na.rm=T ),
metro.race.rank.00 = ntile( (100-p.white.00), 100 ) ) %>%
ungroup() %>%
mutate( metro.mhv.pct.change = metro.mhv.pct.10 - metro.mhv.pct.00,
pay.change = metro.median.pay.10 - metro.median.pay.00,
race.change = p.white.10 - p.white.00,
mhv.change = mhv.10 - mhv.00 )
Descriptive Statistics of Change Variables
d3 <-
  d3 %>%
  select( c( "tractid", "cbsa", "cbsaname",
"mhv.00", "mhv.10", "mhv.change","pct.change",
"p.white.00", "p.black.00", "p.hisp.00", "p.asian.00",
"p.hs.edu.00", "p.col.edu.00", "p.prof.00", "p.unemp.00",
"pov.rate.00", "p.white.10", "p.black.10", "p.hisp.10",
"p.asian.10", "p.hs.edu.10", "p.col.edu.10", "p.prof.10",
"p.unemp.10", "pov.rate.10", "metro.mhv.pct.00",
"metro.mhv.pct.10", "metro.median.pay.00", "metro.median.pay.10",
"metro.mhv.pct.change", "pay.change", "race.change",
"metro.race.rank.00") )
# head( d3 ) %>% pander()
d3 <- data.frame(d3)

stargazer( d3,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
mhv.00 | 11,167 | 105,661 | 154,903 | 187,129 | 224,337 | 1,288,551 |
mhv.10 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
mhv.change | -1,228,651 | 7,101 | 35,981 | 59,372 | 94,140 | 983,765 |
pct.change | -97 | 6 | 25 | 33 | 50 | 6,059 |
p.white.00 | 0 | 47 | 78 | 67 | 91 | 100 |
p.black.00 | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp.00 | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian.00 | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs.edu.00 | 0 | 67 | 72 | 72 | 77 | 100 |
p.col.edu.00 | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof.00 | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp.00 | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate.00 | 0 | 4 | 9 | 12 | 17 | 100 |
p.white.10 | 0 | 37 | 70 | 60 | 87 | 100 |
p.black.10 | 0 | 2 | 5 | 15 | 17 | 100 |
p.hisp.10 | 0 | 3 | 7 | 17 | 21 | 100 |
p.asian.10 | 0 | 1 | 3 | 6 | 6 | 96 |
p.hs.edu.10 | 0 | 66 | 71 | 71 | 77 | 100 |
p.col.edu.10 | 0 | 15 | 25 | 30 | 41 | 100 |
p.prof.10 | 0 | 24 | 34 | 35 | 46 | 100 |
p.unemp.10 | 0 | 6 | 9 | 10 | 13 | 100 |
pov.rate.10 | 0 | 6 | 12 | 16 | 22 | 100 |
metro.mhv.pct.00 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.mhv.pct.10 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.median.pay.00 | 23,012 | 39,457 | 43,139 | 45,054 | 49,522 | 73,701 |
metro.median.pay.10 | 30,337 | 47,736 | 54,728 | 55,905 | 60,119 | 96,033 |
metro.mhv.pct.change | -99 | -6 | -1 | 0 | 4 | 99 |
pay.change | -3,207 | 7,495 | 9,764 | 10,851 | 13,809 | 27,310 |
race.change | -100 | -10 | -5 | -6 | -1 | 90 |
metro.race.rank.00 | 1 | 20 | 41 | 45 | 68 | 100 |
Operationalizing Gentrification
Which definition did you select for gentrification, and how would you operationalize it?
# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?
# home value in lower than average home in a metro in 2000
poor.2000 <- d3$metro.mhv.pct.00 < 50

# above average diversity for metro
diverse.2000 <- d3$metro.race.rank.00 > 50

# home values increased more than overall city gains
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0

# faster than average growth
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25

# proportion of whites increases by more than 3 percent
# measured by increase in white
loss.diversity <- d3$race.change > 3

g.flag <- poor.2000 & diverse.2000 & mhv.pct.increase & home.val.rise & loss.diversity

num.candidates <- sum( poor.2000 & diverse.2000, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )

num.gentrified
## [1] 1137
num.candidates
## [1] 18219
num.gentrified / num.candidates
## [1] 0.06240738
By this definition, only about 6.2 percent of urban tracts experienced gentrification between 2000 and 2010.
Might the small initial home values be skewing these numbers?
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.00[ mhv.00 < 1000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.891 5.918 25.402 33.167 49.556 6059.261 220
Discussion
What do you think? Is that the right way to operationalize it? Do you care about relative diversity (a neighborhood that is more diverse than the rest of its city) or absolute diversity (the percentage of non-whites, regardless of how diverse the city is)?
Do we care about the absolute increase in value, or the increase relative to the national average? The national average includes all of the gentrifying tracts, which could skew it upward and make it a poor benchmark. Maybe look at the average increase in value for non-gentrification candidates instead?
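As a rough check on that last question, one possible sketch (using the flags created above; candidate is a new name introduced here) compares growth among gentrification candidates and non-candidates:
# does the benchmark look different once candidates are excluded?
candidate <- poor.2000 & diverse.2000

median( d3$pct.change[ candidate ], na.rm=T )    # candidate tracts
median( d3$pct.change[ ! candidate ], na.rm=T )  # non-candidates (alternative benchmark)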
Spatial Visualization
Dust off your GIS skills from CPP 529 so that you can visualize some of your metrics.
You will need to pick one or more cities to use as examples in your report. It is strongly recommended that you create a Dorling cartogram for reporting, since Census tracts introduce visual bias by over-emphasizing low-density tracts and hiding much of the data in the areas where the greatest number of people in the city reside. Recall that Dorling cartograms correct for this by re-sizing administrative units in proportion to the population they represent.
The lab from CPP 529 that covers creating Dorling cartograms is located here:
https://ds4ps.org/cpp-529-master/labs/lab-04-instructions.html
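For reference, the core step from that lab looks roughly like the sketch below. The object tracts.sp and its POP column are placeholder names, not objects created in this tutorial; this tutorial loads a pre-built Dorling file for Phoenix instead.
# rough sketch of building a Dorling cartogram from a tract shapefile
library( sp )
library( cartogram )

tracts.sp <- spTransform( tracts.sp, CRS("+init=epsg:3395") )            # project first
tracts.dorling <- cartogram_dorling( x=tracts.sp, weight="POP", k=0.05 )
plot( tracts.dorling )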
# devtools::install_github( "sjewo/cartogram" )
# install.packages( "tmap" )
library( geojsonio ) # read geoJSON map files from GitHub
library( sp ) # spatial data class sp for shapefiles
library( cartogram ) # spatial maps w/ tract size bias reduction
library( tmap ) # thematic maps
library( maptools ) # spatial object manipulation
library( sf ) # 'simple features' flavor of shapefiles
# we have phoenix already packaged and on GitHub for easy load:
<- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/phx_dorling.geojson"
github.url <- geojson_read( x=github.url, what="sp" )
phx plot( phx )
# create small dataframe for the merge
df <- data.frame( tractid=d$tractid,
                  mhv.00, mhv.10, mhv.change, pct.change )

# create GEOID that matches GIS format
# create a geoID for merging by tract
df$GEOID <- substr( df$tractid, 6, 18 )  # extract codes
df$GEOID <- gsub( "-", "", df$GEOID )    # remove hyphens
class( df$GEOID )
## [1] "character"
head( df$GEOID )
## [1] "01001020100" "01001020200" "01001020300" "01001020400" "01001020500"
## [6] "01001020600"
Note the current class of the PHX shapefile IDs:
- GEOID2 is numeric, has leading zero dropped
- GEOID is a factor that retains leading zeros
You can either convert the data frame ID to numeric (dropping the leading zero) and merge on GEOID2, or keep it as a character vector and merge on GEOID.
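If you prefer the numeric route, a minimal sketch looks like this (the steps below keep the character GEOID instead):
# alternative: drop the leading zero and merge on the numeric ID
df$GEOID2 <- as.numeric( df$GEOID )
# phx <- merge( phx, df, by.x="GEOID2", by.y="GEOID2" )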
head( phx@data ) # sp class from GIS file, so data frame is located @data
# merge census data with dorling map
nrow( phx ) # check dimensions
## [1] 913
<- merge( phx, df, by.x="GEOID", by.y="GEOID" )
phx
# make sure they have not changed or
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( phx )
## [1] 913
phx <- spTransform( phx, CRS("+init=epsg:3395") )

bb <- st_bbox( c( xmin = -12519146, xmax = -12421368,
                  ymax = 3965924, ymin = 3899074 ),
               crs = st_crs("+init=epsg:3395"))
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )