This tutorial contains the steps the team took to create a data subset of the LTDB files and conduct descriptive analysis of that subset.
library( here )
library( data.table )
library( dplyr )
library( pander )
library( tidyverse )
library( readr )
library( forecast )
library( knitr )
library( stargazer )
library( scales )
library( geojsonio )
library( sp )
library( sf )
library( mclust )
library( tmap )
library( ggplot2 )
library( ggthemes )
library( dplyr )
library( pander )
library( tidycensus )
library( cartogram )
library( maptools )
library( corrplot )
<- "text" s.type
In order to determine changes in home value differ between 1990 and 2000, a 1990 to 2000 (ten years) data sets were used.
<- readRDS( here::here( "data/rodeo/LTDB-1990.rds" ) )
d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )
md
# check to make sure you are not losing or gaining observations in the merge
nrow( d1 )
## [1] 72693
<- select( d1, - year )
d1 <- select( d2, - year )
d2
<- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )
d
nrow( d )
## [1] 72693
Filter Urban Districts
table( d$urban )
##
## rural urban
## 12971 59722
<- filter( d, urban == "urban" ) d
To enhance proper analysis of the data sets, common variables need to be identified for 1990 and 2000 data sets:
# find variables that are in both files
<- function( df1, df2 )
compare_dfs
{# use regular expressions to remove numeric suffixes
.1 <- names( df1 )
var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
var.names
.2 <- names( df2 )
var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
var.names
<- intersect( var.names.1, var.names.2 ) %>% sort()
shared print( "SHARED VARIABLES:")
print( shared )
<- c( setdiff( var.names.1, var.names.2 ),
not.shared setdiff( var.names.2, var.names.1 ) ) %>% sort()
print( "NOT SHARED:" )
print( not.shared )
<- data.frame( type="shared", variables=shared, stringsAsFactors=F )
d.vars1 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
d.vars2 <- rbind( d.vars1, d.vars2 )
dd
return( dd )
}
<- compare_dfs( df1=d1, df2=d2 ) vars
## [1] "SHARED VARIABLES:"
## [1] "a15asn" "a15blk" "a15hsp" "a15ntv" "a15wht" "a18und" "a60asn"
## [8] "a60blk" "a60hsp" "a60ntv" "a60up" "a60wht" "a75up" "ag15up"
## [15] "ag25up" "ag5up" "ageasn" "ageblk" "agehsp" "agentv" "agewht"
## [22] "asian" "china" "clf" "cni16u" "col" "cuban" "dapov"
## [29] "dbpov" "dflabf" "dfmpov" "dhpov" "dis" "dmulti" "dnapov"
## [36] "dpov" "dwpov" "empclf" "family" "fb" "fhh" "filip"
## [43] "flabf" "geanc" "gefb" "h10yrs" "h30old" "haw" "hh"
## [50] "hha" "hhb" "hhh" "hhw" "hinc" "hinca" "hincb"
## [57] "hinch" "hincw" "hisp" "hs" "hu" "incpc" "india"
## [64] "iranc" "irfb" "itanc" "itfb" "japan" "korea" "lep"
## [71] "manuf" "mar" "mex" "mhmval" "mrent" "multi" "n10imm"
## [78] "n65pov" "napov" "nat" "nbpov" "nfmpov" "nhblk" "nhpov"
## [85] "nhwht" "nnapov" "npov" "ntv" "nwpov" "ohu" "olang"
## [92] "own" "pop" "pr" "prof" "rent" "ruanc" "rufb"
## [99] "scanc" "scfb" "semp" "tractid" "unemp" "vac" "vet"
## [106] "viet" "wds"
## [1] "NOT SHARED:"
## [1] "ag16cv" "ag18cv" "hu00sp" "hu90sp" "ohu00sp" "ohu90sp" "pop90.1"
<- d # keep a copy so you don't have to reload d.full
<- d.full # story original in case you need to reset anything
d
<- select( d, tractid, mhmval90, mhmval00, hinc90,
d
hu90, own90, rent90,
empclf90, clf90, unemp90, prof90,
dpov90, npov90,
ag25up90, hs90, col90,
pop90.x, nhwht90, nhblk90, hisp90, asian90,
cbsa, cbsaname )<-
d %>%
d mutate( p.white = 100 * nhwht90 / pop90.x,
p.black = 100 * nhblk90 / pop90.x,
p.hisp = 100 * hisp90 / pop90.x,
p.asian = 100 * asian90 / pop90.x,
p.hs = 100 * (hs90+col90) / ag25up90,
p.col = 100 * col90 / ag25up90,
p.prof = 100 * prof90 / empclf90,
p.unemp = 100 * unemp90 / clf90,
pov.rate = 100 * npov90 / dpov90 )
The descriptive statistic below shows the summary statistics of all the variables used for analysis.
stargazer( d,
type="html",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
mhmval90 | 0 | 58,800 | 86,500 | 112,399 | 141,800 | 500,001 |
mhmval00 | 0 | 81,600 | 119,900 | 144,738 | 173,894 | 1,000,001 |
hinc90 | 4,999 | 24,355 | 32,033 | 34,160 | 41,520 | 150,001 |
hu90 | 0 | 916 | 1,337 | 1,382 | 1,804 | 11,003 |
own90 | 0 | 422 | 744 | 788 | 1,093 | 8,180 |
rent90 | 0 | 168 | 358 | 476 | 663 | 8,653 |
empclf90 | 0 | 1,015 | 1,555 | 1,625 | 2,157 | 11,816 |
clf90 | 0 | 1,101 | 1,664 | 1,732 | 2,291 | 12,497 |
unemp90 | 0 | 49 | 88 | 107 | 143 | 1,165 |
prof90 | 0 | 192 | 359 | 451 | 616 | 6,290 |
dpov90 | 0 | 2,219 | 3,250 | 3,332 | 4,365 | 23,619 |
npov90 | 0 | 112 | 248 | 406 | 526 | 5,890 |
ag25up90 | 0 | 1,444 | 2,121 | 2,187 | 2,865 | 18,461 |
hs90 | 0 | 626 | 1,064 | 1,141 | 1,559 | 12,383 |
col90 | 0 | 162 | 335 | 483 | 652 | 8,575 |
pop90.x | 0 | 2,283 | 3,325 | 3,419 | 4,464 | 35,721 |
nhwht90 | 0 | 1,262 | 2,424 | 2,513 | 3,575 | 21,561 |
nhblk90 | 0 | 18 | 76 | 426 | 331 | 12,121 |
hisp90 | 0 | 27 | 75 | 347 | 280 | 13,873 |
asian90 | 0 | 9 | 31 | 111 | 94 | 7,899 |
p.white | 0 | 64 | 87 | 74 | 95 | 100 |
p.black | 0 | 1 | 3 | 12 | 10 | 100 |
p.hisp | 0 | 1 | 3 | 10 | 9 | 100 |
p.asian | 0 | 0 | 1 | 3 | 3 | 94 |
p.hs | 0 | 69 | 74 | 74 | 80 | 100 |
p.col | 0 | 10 | 18 | 22 | 30 | 100 |
p.prof | 0 | 17 | 25 | 27 | 34 | 100 |
p.unemp | 0 | 4 | 5 | 7 | 8 | 64 |
pov.rate | 0 | 4 | 8 | 12 | 16 | 100 |
Initial conditions for data in 1990 and the home value for inflation.
# adjust 2000 home values for inflation
.90 <- d$mhmval90 * 1.32
mhv.00 <- d$mhmval00
mhv
<- mhv.00 - mhv.90
mhv.change
<- data.frame( MedianHomeValue1990=mhv.90,
df MedianHomeValue2000=mhv.00,
Change.90.to.00=mhv.change )
stargazer( df,
type="html",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
MedianHomeValue1990 | 0 | 77,616 | 114,180 | 148,367 | 187,176 | 660,001 |
MedianHomeValue2000 | 0 | 81,600 | 119,900 | 144,738 | 173,894 | 1,000,001 |
Change.90.to.00 | -660,001 | -23,339 | 453 | -3,629 | 19,100 | 1,000,001 |
This histogram shows the distribution of home value changes between 1990 and 2000:
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousand of US Dollars (adjusted to 2000)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 1990 to 2000",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
<- mean( mhv.change/1000, na.rm=T )
mean.x abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ),
col="darkorange", cex=1.8, pos=3 )
<- median( mhv.change/1000, na.rm=T )
median.x abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ),
col="dodgerblue", cex=1.8, pos=3 )
# function to control plot() formatting
<- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
jplot
{
plot( x1, x2,
pch=19,
col=gray(0.6, alpha = 0.2),
cex=2.5,
bty = "n",
xlab=lab1,
ylab=lab2, cex.lab=1.5,
... )
if( draw.line==T ){
<- is.finite(x1) & is.finite(x2)
ok lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }
}
The distributions below compares the distribution of the data raw data between 1990 and 2000:
<- matrix( c( 1,3,
layout.matrix 2,3 ),
nrow=2, ncol=2, byrow=T )
layout( mat = layout.matrix,
heights = c(2,2), # Heights of the two rows
widths = c(3,4)) # Widths of the two columns
# layout.show(3)
par( mar=c(4,0,0,2) )
hist( mhv.90/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
abline( v=seq(0,1000,100), lty=2, col="gray80" )
text( 550, 4000, labels="Median Home \nValue in 1990",
col="darkslateblue", cex=1.8 )
hist( mhv.00/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )
text( 550, 3500, labels="Median Home \nValue in 2000",
col="darkslateblue", cex=1.8 )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
# data reduction - filter 1,000 observations
<- data.frame( v90=mhv.90/1000, v00=mhv.00/1000 )
df <- sample_n( df, 1000 )
df
par( mar=c(4,5,3,2) )
jplot( df$v90, df$v00,
lab1="MHV in 1990", lab2="MHV in 2000",
xlim=c(0,1000), ylim=c(0,1000),
axes=F )
abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
The small initial values in the data are skewing the percentages. Assuming and average home value below /$10K is a vacant lot, these data can be removed. Outliers with over 500% change should also be removed to limit data skew.
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic
.90[ mhv.90 < 10000 ] <- NA
mhv<- mhv.change / mhv.90
pct.change summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.00000 -0.15942 0.00436 0.05019 0.20222 28.32257 145
How many entries show a >500% increase:
# how many cases had increases above 500%
sum( pct.change > 5, na.rm=T )
## [1] 26
Preview of the data with large increase (>500%) to see if this makes sense.
# preview tracts with large increases in home values
# to see if increases make sense
%>%
d filter( pct.change > 5 ) %>%
head()
With the outliars removed to minimize skew, we can now accurately represent the percent change in home value between 1990 and 2000.
<-
hg hist( pct.change, breaks=2000,
xlim=c(-1,2), yaxt="n", xaxt="n",
xlab="", cex.main=1.5,
ylab="", main="Growth in Home Value by Census Tract 1990 to 2000",
col="gray40", border="white" )
axis( side=1, at=seq( from=-1, to=2, by=0.5 ),
labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
<- max( hg$count )
ymax
<- mean( pct.change, na.rm=T )
mean.x abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax),
labels=paste0( "Mean = ", round(100*mean.x,0), "%"),
col="darkorange", cex=1.8, pos=4 )
<- median( pct.change, na.rm=T )
median.x abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax),
labels=paste0( "Median = ", round(100*median.x,0), "%"),
col="dodgerblue", cex=1.8, pos=4 )
We often want to disagregate descriptive by some grouping in the data, such as metro areas. dplyr makes this easy by grouping then summarizing the data.
$mhv.change <- mhv.change
d$pct.change <- pct.change
d$mhv.00 <- mhv.00
d$mhv.90 <- mhv.90
d
%>%
d group_by( cbsaname ) %>%
summarize( ave.change = median( mhv.change, na.rm=T ),
ave.change.d = dollar( round(ave.change,0) ),
growth = 100 * median( pct.change, na.rm=T ) ) %>%
ungroup() %>%
arrange( - growth ) %>%
select( - ave.change ) %>%
head( 25 ) %>%
pander()
cbsaname | ave.change.d | growth |
---|---|---|
Corvallis, OR | $73,403 | 75.42 |
Portland-Vancouver-Beaverton, OR-WA | $67,900 | 71.8 |
Salt Lake City, UT | $61,005 | 69.02 |
Boulder, CO | $94,414 | 68.56 |
Provo-Orem, UT | $60,131 | 67.84 |
Salem, OR | $48,726 | 61.27 |
Eugene-Springfield, OR | $51,072 | 61.11 |
Fort Collins-Loveland, CO | $57,232 | 56.31 |
Longview, WA | $40,726 | 55.06 |
Missoula, MT | $45,520 | 54.35 |
Jackson, MI | $28,210 | 53.41 |
Greeley, CO | $41,416 | 51.69 |
Wenatchee, WA | $45,676 | 50.45 |
Detroit-Livonia-Dearborn, MI | $28,733 | 49.37 |
Yakima, WA | $30,828 | 49.34 |
Ogden-Clearfield, UT | $42,194 | 48.51 |
Denver-Aurora, CO | $52,010 | 47.94 |
Monroe, MI | $42,004 | 47.38 |
Bay City, MI | $26,654 | 47.04 |
Logan, UT-ID | $35,543 | 46.43 |
Eau Claire, WI | $27,792 | 43.02 |
Madison, WI | $42,897 | 42.14 |
Sioux Falls, SD | $31,025 | 41.11 |
Grand Junction, CO | $34,334 | 40.9 |
Mount Vernon-Anacortes, WA a | $41,944 | 40.61 |
Comparing the median home value between 1990-2000 and 2000-2010. The above analysis shows that 1990-2000 has a median change of $453. Reviewing the tutorial analysis, 2000-2010 has a median change of $36,268. Change in home values have increased by over $35,000 in just ten years!
The average percent change in growth in home value between 1990 and 2000 is only 5% where the average percent change in growth in home value between 2000 and 2010 is 33%!
The original merged dataset we saved as d.full, so we do not need to reload it:
Recall our data steps thus far:
# adjust 2000 home values for inflation
.1990 <- d.full$mhmval90 * 1.32
mhv.00 <- d.full$mhmval00
mhv
<- mhv.00 - mhv.90
mhv.change
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
.90[ mhv.90 < 10000 ] <- NA
mhv<- 100 * ( mhv.change / mhv.90)
pct.change summary( pct.change ) %>% pander()
Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | NA’s |
---|---|---|---|---|---|---|
-100 | -15.94 | 0.4362 | 5.019 | 20.22 | 2832 | 145 |
$mhv.90 <- mhv.90
d.full$mhv.00 <- mhv.00
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.change d.full
The selection of variables for operationalizing and defining gentrification: We added some variables from 2000:
head( vars )
<- select( d.full,
d3
# ids / units of analysis
tractid, cbsa, cbsaname,
.90, mhv.00, mhv.change, pct.change, # home value
mhv
# ses
hinc90, hu90, own90, rent90,
hinc00, hu00, own00, rent00,
# employment
empclf90, clf90, unemp90, prof90,
empclf00, clf00, unemp00, prof00,
# poverty
dpov90, npov90,
dpov00, npov00,
# education
ag25up90, hs90, col90,
ag25up00, hs00, col00,
# race
pop90.x, nhwht90, nhblk90, hisp90, asian90,
pop00.x, nhwht00, nhblk00, hisp00, asian00
# end select )
<-
d3 %>%
d3 mutate(
# 1990 variables
p.white.90 = 100 * nhwht90 / pop90.x,
p.black.90 = 100 * nhblk90 / pop90.x,
p.hisp.90 = 100 * hisp90 / pop90.x,
p.asian.90 = 100 * asian90 / pop90.x,
p.hs.edu.90 = 100 * (hs90+col90) / ag25up90,
p.col.edu.90 = 100 * col90 / ag25up90,
p.prof.90 = 100 * prof90 / empclf90,
p.unemp.90 = 100 * unemp90 / clf90,
pov.rate.90 = 100 * npov90 / dpov90,
# 2000 variables
p.white.00 = 100 * nhwht00 / pop00.x,
p.black.00 = 100 * nhblk00 / pop00.x,
p.hisp.00 = 100 * hisp00 / pop00.x,
p.asian.00 = 100 * asian00 / pop00.x,
p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
p.col.edu.00 = 100 * col00 / ag25up00,
p.prof.00 = 100 * prof00 / empclf00,
p.unemp.00 = 100 * unemp00 / clf00,
pov.rate.00 = 100 * npov00 / dpov00 )
<-
d3 %>%
d3 group_by( cbsaname ) %>%
mutate( metro.mhv.pct.90 = ntile( mhv.90, 100 ),
metro.mhv.pct.00 = ntile( mhv.00, 100 ),
metro.median.pay.90 = median( hinc90, na.rm=T ),
metro.median.pay.00 = median( hinc00, na.rm=T ),
metro.race.rank.90 = ntile( (100-p.white.90), 100 ) ) %>%
ungroup() %>%
mutate( metro.mhv.pct.change = metro.mhv.pct.00 - metro.mhv.pct.90,
pay.change = metro.median.pay.00 - metro.median.pay.90,
race.change = p.white.00 - p.white.90,
mhv.change = mhv.00 - mhv.90 )
The table below describes the summary statistics for the selected change variables.
<-
d3 %>%
d3 select( c( "tractid", "cbsa", "cbsaname",
"mhv.90", "mhv.00", "mhv.change","pct.change",
"p.white.90", "p.black.90", "p.hisp.90", "p.asian.90",
"p.hs.edu.90", "p.col.edu.90", "p.prof.90", "p.unemp.90",
"pov.rate.90", "p.white.00", "p.black.00", "p.hisp.00",
"p.asian.00", "p.hs.edu.00", "p.col.edu.00", "p.prof.00",
"p.unemp.00", "pov.rate.00", "metro.mhv.pct.90",
"metro.mhv.pct.00", "metro.median.pay.90", "metro.median.pay.00",
"metro.mhv.pct.change", "pay.change", "race.change",
"metro.race.rank.90") )
# head( d3 ) %>% pander()
<- data.frame(d3)
d3 stargazer( d3,
type="html",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
mhv.90 | 11,712 | 77,880 | 114,312 | 148,728 | 187,440 | 660,001 |
mhv.00 | 0 | 81,600 | 119,900 | 144,738 | 173,894 | 1,000,001 |
mhv.change | -660,001 | -23,448 | 396 | -4,122 | 18,968 | 963,701 |
pct.change | -100 | -16 | 0 | 5 | 20 | 2,832 |
p.white.90 | 0 | 64 | 87 | 74 | 95 | 100 |
p.black.90 | 0 | 1 | 3 | 12 | 10 | 100 |
p.hisp.90 | 0 | 1 | 3 | 10 | 9 | 100 |
p.asian.90 | 0 | 0 | 1 | 3 | 3 | 94 |
p.hs.edu.90 | 0 | 69 | 74 | 74 | 80 | 100 |
p.col.edu.90 | 0 | 10 | 18 | 22 | 30 | 100 |
p.prof.90 | 0 | 17 | 25 | 27 | 34 | 100 |
p.unemp.90 | 0 | 4 | 5 | 7 | 8 | 64 |
pov.rate.90 | 0 | 4 | 8 | 12 | 16 | 100 |
p.white.00 | 0 | 47 | 78 | 67 | 91 | 100 |
p.black.00 | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp.00 | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian.00 | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs.edu.00 | 0 | 67 | 72 | 72 | 77 | 100 |
p.col.edu.00 | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof.00 | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp.00 | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate.00 | 0 | 4 | 9 | 12 | 17 | 100 |
metro.mhv.pct.90 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.mhv.pct.00 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.median.pay.90 | 14,871 | 28,906 | 32,457 | 32,924 | 35,833 | 52,374 |
metro.median.pay.00 | 23,012 | 39,457 | 43,139 | 45,054 | 49,522 | 73,701 |
metro.mhv.pct.change | -99 | -5 | 0 | 0 | 6 | 99 |
pay.change | 4,930 | 9,775 | 11,441 | 12,130 | 14,001 | 26,211 |
race.change | -100 | -12 | -5 | -8 | -2 | 100 |
metro.race.rank.90 | 1 | 20 | 41 | 45 | 68 | 100 |
# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?
# home value in lower than average home in a metro in 2000
.1990 <- d3$metro.mhv.pct.90 < 50
poor
# above average diversity for metro
.1990 <- d3$metro.race.rank.90 > 50
diverse
# home values increased more than overall city gains
# change in percentile rank within the metro
<- d3$metro.mhv.pct.change > 0
mhv.pct.increase
# faster than average growth
# 25% growth in value is median for the country
<- d3$pct.change > 25
home.val.rise
# proportion of whites increases by more than 3 percent
# measured by increase in white
<- d3$race.change > 3
loss.diversity
<- poor.1990 & diverse.1990 & mhv.pct.increase & home.val.rise & loss.diversity
g.flag
<- sum( poor.1990 & diverse.1990, na.rm=T )
num.candidates <- sum( g.flag, na.rm=T ) num.gentrified
num.gentrified
## [1] 371
num.candidates
## [1] 17560
/ num.candidates num.gentrified
## [1] 0.02112756
By the above definition, based on the analysis only 2.1 percent of urban tracts experience gentrification between 1990 and 2000 as compared to 5.7 percent from 2000 to 2010.
17560 of census tracts are candidates, and 371 transitioned into advanced stage of gentrification.
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
.90[ mhv.90 < 1000 ] <- NA
mhv<- 100 * ( mhv.change / mhv.90 )
pct.change summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -100.0000 -15.9418 0.4362 5.0194 20.2215 2832.2571 145
The dorling cartogram shows the census data and groups from clustering.
# DATA STEPS
# load dorling cartogram from github
# map already contains census data and groups from clustering
<- "https://raw.githubusercontent.com/WSKQ23/william-project/main/pittt_dorling.geojson"
github.url <- geojson_read( x=github.url, what="sp" )
pit
# make sure they have not changed or
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( pit )
## [1] 627
plot(pit)
# create small dataframe for the merge
<- data.frame( tractid=d$tractid,
df .90, mhv.00, mhv.change, pct.change )
mhv
# create GEOID that matches GIS format
# create a geoID for merging by tract
$GEOID <- substr( df$tractid, 6, 18 ) # extract codes
df$GEOID <- gsub( "-", "", df$GEOID ) # remove hyphens
dfclass( df$GEOID )
## [1] "character"
head( pit@data )
<- merge( pit, df, by.x="GEOID", by.y="GEOID" )
pit
# make sure they have not changed or
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( pit )
## [1] 627
<- spTransform( pit, CRS("+init=epsg:3395") )
pit
<- st_bbox( c( xmin = -9100000, xmax = -8700000,
bb ymax = 4800000, ymin = 5000000 ),
crs = st_crs("+init=epsg:3395"))
## Warning in CPL_crs_from_input(x): GDAL Message 1: +init=epsg:XXXX syntax is
## deprecated. It might return a CRS with a non-EPSG compliant axis order.
tm_shape( pit, bbox=bb ) +
tm_polygons( col="mhv.90", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
The cartogram above shows candidates for gentrification in 1990 disaggregated by mean home value. The map shows a higher median home value in the North-West region of the city and lower median home values toward the center.
From 2000:
<- spTransform( pit, CRS("+init=epsg:3395") ) pit
## Warning: PROJ support is provided by the sf and terra packages among others
<- st_bbox( c( xmin = -9100000, xmax = -8700000,
bb ymax = 4800000, ymin = 5000000 ),
crs = st_crs("+init=epsg:3395"))
tm_shape( pit, bbox=bb ) +
tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
The cartogram above shows candidates for gentrification in 2000 disaggregated by mean home value. The map shows a higher median home value in the North and South-West region of the city and lower median home values toward the center.
tm_shape( pit, bbox=bb ) +
tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
The cartogram above shows the distribution of gentrification across the city between 1990 and 2000. The map shows that change in the median home value happens more toward the north and north-west areas of Pittsburgh.
tm_shape( pit, bbox=bb ) +
tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
The cartogram above shows that the percent change in median home value as a result of the distribution of gentrification happens in the far northern area of Pittsburgh.