This tutorial contains the steps the team took to create a data subset of the LTDB files and conduct descriptive analysis of that subset.

library( here )
library( data.table )
library( dplyr )
library( pander )
library( tidyverse )
library( readr )
library( forecast )
library( knitr )
library( stargazer )
library( scales )

library( geojsonio )   
library( sp )          
library( sf )          
library( mclust )       
library( tmap )        
library( ggplot2 )      
library( ggthemes )    
library( dplyr )       
library( pander )      
library( tidycensus )
library( cartogram )  
library( maptools )
library( corrplot )

s.type <- "text"

Data set use for analysis

In order to determine changes in home value differ between 1990 and 2000, a 1990 to 2000 (ten years) data sets were used.

d1 <- readRDS( here::here( "data/rodeo/LTDB-1990.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )

# check to make sure you are not losing or gaining observations in the merge

nrow( d1 )
## [1] 72693
d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )

nrow( d )
## [1] 72693

Filter Urban Districts

table( d$urban )
## 
## rural urban 
## 12971 59722
d <- filter( d, urban == "urban" )

Identify Common Variables

To enhance proper analysis of the data sets, common variables need to be identified for 1990 and 2000 data sets:

# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
  # use regular expressions to remove numeric suffixes 
  var.names.1 <- names( df1 )
  var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
  var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
  
  var.names.2 <- names( df2 )
  var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
  var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
  
  shared <- intersect( var.names.1, var.names.2 ) %>% sort()
  print( "SHARED VARIABLES:")
  print( shared )
  
  not.shared <- c( setdiff( var.names.1, var.names.2 ),
                   setdiff( var.names.2, var.names.1 ) ) %>% sort()
  
  print( "NOT SHARED:" )
  print( not.shared )
  
  d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
  d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
  dd <- rbind( d.vars1, d.vars2 )
  
  return( dd )
}

vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
##   [1] "a15asn"  "a15blk"  "a15hsp"  "a15ntv"  "a15wht"  "a18und"  "a60asn" 
##   [8] "a60blk"  "a60hsp"  "a60ntv"  "a60up"   "a60wht"  "a75up"   "ag15up" 
##  [15] "ag25up"  "ag5up"   "ageasn"  "ageblk"  "agehsp"  "agentv"  "agewht" 
##  [22] "asian"   "china"   "clf"     "cni16u"  "col"     "cuban"   "dapov"  
##  [29] "dbpov"   "dflabf"  "dfmpov"  "dhpov"   "dis"     "dmulti"  "dnapov" 
##  [36] "dpov"    "dwpov"   "empclf"  "family"  "fb"      "fhh"     "filip"  
##  [43] "flabf"   "geanc"   "gefb"    "h10yrs"  "h30old"  "haw"     "hh"     
##  [50] "hha"     "hhb"     "hhh"     "hhw"     "hinc"    "hinca"   "hincb"  
##  [57] "hinch"   "hincw"   "hisp"    "hs"      "hu"      "incpc"   "india"  
##  [64] "iranc"   "irfb"    "itanc"   "itfb"    "japan"   "korea"   "lep"    
##  [71] "manuf"   "mar"     "mex"     "mhmval"  "mrent"   "multi"   "n10imm" 
##  [78] "n65pov"  "napov"   "nat"     "nbpov"   "nfmpov"  "nhblk"   "nhpov"  
##  [85] "nhwht"   "nnapov"  "npov"    "ntv"     "nwpov"   "ohu"     "olang"  
##  [92] "own"     "pop"     "pr"      "prof"    "rent"    "ruanc"   "rufb"   
##  [99] "scanc"   "scfb"    "semp"    "tractid" "unemp"   "vac"     "vet"    
## [106] "viet"    "wds"    
## [1] "NOT SHARED:"
## [1] "ag16cv"  "ag18cv"  "hu00sp"  "hu90sp"  "ohu00sp" "ohu90sp" "pop90.1"
d.full <- d  # keep a copy so you don't have to reload 
d <- d.full  # story original in case you need to reset anything

d <- select( d, tractid, mhmval90, mhmval00, hinc90, 
             hu90, own90, rent90,  
             empclf90, clf90, unemp90, prof90,  
             dpov90, npov90,
             ag25up90, hs90, col90, 
             pop90.x, nhwht90, nhblk90, hisp90, asian90,
             cbsa, cbsaname )
d <- 
  d %>%
  mutate( p.white = 100 * nhwht90 / pop90.x,
          p.black = 100 * nhblk90 / pop90.x,
          p.hisp = 100 * hisp90 / pop90.x, 
          p.asian = 100 * asian90 / pop90.x,
          p.hs = 100 * (hs90+col90) / ag25up90,
          p.col = 100 * col90 / ag25up90,
          p.prof = 100 * prof90 / empclf90,
          p.unemp = 100 * unemp90 / clf90,
          pov.rate = 100 * npov90 / dpov90 )

Table of descriptive statistics

The descriptive statistic below shows the summary statistics of all the variables used for analysis.

stargazer( d, 
           type="html", 
           digits=0,
           summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic Min Pctl(25) Median Mean Pctl(75) Max
mhmval90 0 58,800 86,500 112,399 141,800 500,001
mhmval00 0 81,600 119,900 144,738 173,894 1,000,001
hinc90 4,999 24,355 32,033 34,160 41,520 150,001
hu90 0 916 1,337 1,382 1,804 11,003
own90 0 422 744 788 1,093 8,180
rent90 0 168 358 476 663 8,653
empclf90 0 1,015 1,555 1,625 2,157 11,816
clf90 0 1,101 1,664 1,732 2,291 12,497
unemp90 0 49 88 107 143 1,165
prof90 0 192 359 451 616 6,290
dpov90 0 2,219 3,250 3,332 4,365 23,619
npov90 0 112 248 406 526 5,890
ag25up90 0 1,444 2,121 2,187 2,865 18,461
hs90 0 626 1,064 1,141 1,559 12,383
col90 0 162 335 483 652 8,575
pop90.x 0 2,283 3,325 3,419 4,464 35,721
nhwht90 0 1,262 2,424 2,513 3,575 21,561
nhblk90 0 18 76 426 331 12,121
hisp90 0 27 75 347 280 13,873
asian90 0 9 31 111 94 7,899
p.white 0 64 87 74 95 100
p.black 0 1 3 12 10 100
p.hisp 0 1 3 10 9 100
p.asian 0 0 1 3 3 94
p.hs 0 69 74 74 80 100
p.col 0 10 18 22 30 100
p.prof 0 17 25 27 34 100
p.unemp 0 4 5 7 8 64
pov.rate 0 4 8 12 16 100

Exploration of Median Home Value

Initial conditions for data in 1990 and the home value for inflation.

# adjust 2000 home values for inflation 
mhv.90 <- d$mhmval90 * 1.32  
mhv.00 <- d$mhmval00

mhv.change <- mhv.00 - mhv.90

df <- data.frame( MedianHomeValue1990=mhv.90, 
                  MedianHomeValue2000=mhv.00, 
                  Change.90.to.00=mhv.change )

stargazer( df, 
           type="html", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic Min Pctl(25) Median Mean Pctl(75) Max
MedianHomeValue1990 0 77,616 114,180 148,367 187,176 660,001
MedianHomeValue2000 0 81,600 119,900 144,738 173,894 1,000,001
Change.90.to.00 -660,001 -23,339 453 -3,629 19,100 1,000,001

Histogram of MHV

This histogram shows the distribution of home value changes between 1990 and 2000:

hist( mhv.change/1000, breaks=500, 
      xlim=c(-100,500), yaxt="n", xaxt="n",
      xlab="Thousand of US Dollars (adjusted to 2000)", cex.lab=1.5,
      ylab="", main="Change in Median Home Value 1990 to 2000",
      col="gray20", border="white" )

axis( side=1, at=seq( from=-100, to=500, by=100 ), 
      labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
        
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500, 
      labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ), 
      col="darkorange", cex=1.8, pos=3 )

median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000, 
      labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ), 
      col="dodgerblue", cex=1.8, pos=3 )

# function to control plot() formatting 

jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{

    plot( x1, x2,
          pch=19, 
          col=gray(0.6, alpha = 0.2), 
          cex=2.5,  
          bty = "n",
          xlab=lab1, 
          ylab=lab2, cex.lab=1.5,
        ... )

    if( draw.line==T ){ 
        ok <- is.finite(x1) & is.finite(x2)
        lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }

}

The distributions below compares the distribution of the data raw data between 1990 and 2000:

layout.matrix <- matrix( c( 1,3,
                            2,3 ), 
                nrow=2, ncol=2, byrow=T )

layout( mat = layout.matrix,
        heights = c(2,2), # Heights of the two rows
        widths =  c(3,4)) # Widths of the two columns

# layout.show(3)

par( mar=c(4,0,0,2) )

hist( mhv.90/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )

abline( v=seq(0,1000,100), lty=2, col="gray80" )

text( 550, 4000, labels="Median Home \nValue in 1990", 
      col="darkslateblue", cex=1.8 )



hist( mhv.00/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )

text( 550, 3500, labels="Median Home \nValue in 2000", 
      col="darkslateblue", cex=1.8 )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )


# data reduction - filter 1,000 observations

df <- data.frame( v90=mhv.90/1000, v00=mhv.00/1000 )
df <- sample_n( df, 1000 )

par( mar=c(4,5,3,2) )

jplot( df$v90, df$v00, 
       lab1="MHV in 1990", lab2="MHV in 2000",
       xlim=c(0,1000), ylim=c(0,1000),
       axes=F )

abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )

Change in MHV 1990-2000

The small initial values in the data are skewing the percentages. Assuming and average home value below /$10K is a vacant lot, these data can be removed. Outliers with over 500% change should also be removed to limit data skew.

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance 
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic


mhv.90[ mhv.90 < 10000 ] <- NA
pct.change <- mhv.change / mhv.90
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -1.00000 -0.15942  0.00436  0.05019  0.20222 28.32257      145

How many entries show a >500% increase:

# how many cases had increases above 500%
sum( pct.change > 5, na.rm=T )
## [1] 26

Preview of the data with large increase (>500%) to see if this makes sense.

# preview tracts with large increases in home values 
# to see if increases make sense 

d %>% 
  filter( pct.change > 5 ) %>% 
  head()

Plot the percent change variable:

With the outliars removed to minimize skew, we can now accurately represent the percent change in home value between 1990 and 2000.

hg <-
hist( pct.change, breaks=2000, 
      xlim=c(-1,2), yaxt="n", xaxt="n",
      xlab="", cex.main=1.5,
      ylab="", main="Growth in Home Value by Census Tract 1990 to 2000",
      col="gray40", border="white" )

axis( side=1, at=seq( from=-1, to=2, by=0.5 ), 
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )

ymax <- max( hg$count )
        
mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax), 
      labels=paste0( "Mean = ", round(100*mean.x,0), "%"), 
      col="darkorange", cex=1.8, pos=4 )

median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax), 
      labels=paste0( "Median = ", round(100*median.x,0), "%"), 
      col="dodgerblue", cex=1.8, pos=4 )

Group Growth Rates By Metro Area

We often want to disagregate descriptive by some grouping in the data, such as metro areas. dplyr makes this easy by grouping then summarizing the data.

d$mhv.change <- mhv.change 
d$pct.change <- pct.change
d$mhv.00 <- mhv.00
d$mhv.90 <- mhv.90

d %>%
  group_by( cbsaname ) %>%
  summarize( ave.change = median( mhv.change, na.rm=T ),
             ave.change.d = dollar( round(ave.change,0) ),
             growth = 100 * median( pct.change, na.rm=T ) ) %>%
  ungroup() %>%
  arrange( - growth ) %>%
  select( - ave.change ) %>% 
  head( 25 ) %>%
  pander()
cbsaname ave.change.d growth
Corvallis, OR $73,403 75.42
Portland-Vancouver-Beaverton, OR-WA $67,900 71.8
Salt Lake City, UT $61,005 69.02
Boulder, CO $94,414 68.56
Provo-Orem, UT $60,131 67.84
Salem, OR $48,726 61.27
Eugene-Springfield, OR $51,072 61.11
Fort Collins-Loveland, CO $57,232 56.31
Longview, WA $40,726 55.06
Missoula, MT $45,520 54.35
Jackson, MI $28,210 53.41
Greeley, CO $41,416 51.69
Wenatchee, WA $45,676 50.45
Detroit-Livonia-Dearborn, MI $28,733 49.37
Yakima, WA $30,828 49.34
Ogden-Clearfield, UT $42,194 48.51
Denver-Aurora, CO $52,010 47.94
Monroe, MI $42,004 47.38
Bay City, MI $26,654 47.04
Logan, UT-ID $35,543 46.43
Eau Claire, WI $27,792 43.02
Madison, WI $42,897 42.14
Sioux Falls, SD $31,025 41.11
Grand Junction, CO $34,334 40.9
Mount Vernon-Anacortes, WA a $41,944 40.61

Comparing the median home value between 1990-2000 and 2000-2010. The above analysis shows that 1990-2000 has a median change of $453. Reviewing the tutorial analysis, 2000-2010 has a median change of $36,268. Change in home values have increased by over $35,000 in just ten years!

The average percent change in growth in home value between 1990 and 2000 is only 5% where the average percent change in growth in home value between 2000 and 2010 is 33%!

Part II

Measuring Gentrification

The original merged dataset we saved as d.full, so we do not need to reload it:

Recall our data steps thus far:

# adjust 2000 home values for inflation 
mhv.1990 <- d.full$mhmval90 * 1.32  
mhv.00 <- d.full$mhmval00

mhv.change <- mhv.00 - mhv.90

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

mhv.90[ mhv.90 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.90)
summary( pct.change ) %>% pander()
Min. 1st Qu. Median Mean 3rd Qu. Max. NA’s
-100 -15.94 0.4362 5.019 20.22 2832 145
d.full$mhv.90 <- mhv.90
d.full$mhv.00 <- mhv.00
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.change

Selection of Gentrification Variables

The selection of variables for operationalizing and defining gentrification: We added some variables from 2000:

head( vars )
d3 <- select( d.full, 
             
             tractid, cbsa, cbsaname,            # ids / units of analysis
             
             mhv.90, mhv.00, mhv.change, pct.change,    # home value 
             
             hinc90, hu90, own90, rent90,        # ses
             hinc00, hu00, own00, rent00,
             
             empclf90, clf90, unemp90, prof90,   # employment 
             empclf00, clf00, unemp00, prof00,
             
             dpov90, npov90,                     # poverty
             dpov00, npov00,
             
             ag25up90, hs90, col90,              # education 
             ag25up00, hs00, col00,
             
             pop90.x, nhwht90, nhblk90, hisp90, asian90,   # race
             pop00.x, nhwht00, nhblk00, hisp00, asian00
             
          ) # end select
d3 <- 
  d3 %>%
  mutate( 
          # 1990 variables
          p.white.90 = 100 * nhwht90 / pop90.x,
          p.black.90 = 100 * nhblk90 / pop90.x,
          p.hisp.90 = 100 * hisp90 / pop90.x, 
          p.asian.90 = 100 * asian90 / pop90.x,
          p.hs.edu.90 = 100 * (hs90+col90) / ag25up90,
          p.col.edu.90 = 100 * col90 / ag25up90,
          p.prof.90 = 100 * prof90 / empclf90,
          p.unemp.90 = 100 * unemp90 / clf90,
          pov.rate.90 = 100 * npov90 / dpov90,
          
          # 2000 variables
          p.white.00 = 100 * nhwht00 / pop00.x,
          p.black.00 = 100 * nhblk00 / pop00.x,
          p.hisp.00 = 100 * hisp00 / pop00.x, 
          p.asian.00 = 100 * asian00 / pop00.x,
          p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
          p.col.edu.00 = 100 * col00 / ag25up00,
          p.prof.00 = 100 * prof00 / empclf00,
          p.unemp.00 = 100 * unemp00 / clf00,
          pov.rate.00 = 100 * npov00 / dpov00 )
d3 <-
  d3 %>%
  group_by( cbsaname ) %>%
  mutate( metro.mhv.pct.90 = ntile( mhv.90, 100 ),
          metro.mhv.pct.00 = ntile( mhv.00, 100 ),
          metro.median.pay.90 = median( hinc90, na.rm=T ),
          metro.median.pay.00 = median( hinc00, na.rm=T ),
          metro.race.rank.90 = ntile( (100-p.white.90), 100 ) ) %>%
  ungroup() %>%
  mutate( metro.mhv.pct.change = metro.mhv.pct.00 - metro.mhv.pct.90,
          pay.change = metro.median.pay.00 - metro.median.pay.90,
          race.change = p.white.00 - p.white.90,
          mhv.change = mhv.00 - mhv.90 )

Descriptive Statistics of Change Variables

The table below describes the summary statistics for the selected change variables.

d3 <-           
  d3 %>%
  select( c( "tractid", "cbsa", "cbsaname",
             "mhv.90", "mhv.00", "mhv.change","pct.change",
          "p.white.90", "p.black.90", "p.hisp.90", "p.asian.90", 
          "p.hs.edu.90", "p.col.edu.90", "p.prof.90",  "p.unemp.90", 
          "pov.rate.90", "p.white.00", "p.black.00", "p.hisp.00", 
          "p.asian.00", "p.hs.edu.00", "p.col.edu.00", "p.prof.00", 
          "p.unemp.00", "pov.rate.00", "metro.mhv.pct.90", 
          "metro.mhv.pct.00", "metro.median.pay.90", "metro.median.pay.00", 
          "metro.mhv.pct.change", "pay.change", "race.change",
          "metro.race.rank.90") ) 
  
# head( d3 ) %>% pander()
d3 <- data.frame(d3)
stargazer( d3, 
           type="html", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic Min Pctl(25) Median Mean Pctl(75) Max
mhv.90 11,712 77,880 114,312 148,728 187,440 660,001
mhv.00 0 81,600 119,900 144,738 173,894 1,000,001
mhv.change -660,001 -23,448 396 -4,122 18,968 963,701
pct.change -100 -16 0 5 20 2,832
p.white.90 0 64 87 74 95 100
p.black.90 0 1 3 12 10 100
p.hisp.90 0 1 3 10 9 100
p.asian.90 0 0 1 3 3 94
p.hs.edu.90 0 69 74 74 80 100
p.col.edu.90 0 10 18 22 30 100
p.prof.90 0 17 25 27 34 100
p.unemp.90 0 4 5 7 8 64
pov.rate.90 0 4 8 12 16 100
p.white.00 0 47 78 67 91 100
p.black.00 0 1 4 14 14 100
p.hisp.00 0 2 4 13 15 100
p.asian.00 0 1 2 5 5 95
p.hs.edu.00 0 67 72 72 77 100
p.col.edu.00 0 12 21 26 36 100
p.prof.00 0 23 31 34 43 100
p.unemp.00 0 3 5 6 8 100
pov.rate.00 0 4 9 12 17 100
metro.mhv.pct.90 1 20 41 45 68 100
metro.mhv.pct.00 1 20 41 45 68 100
metro.median.pay.90 14,871 28,906 32,457 32,924 35,833 52,374
metro.median.pay.00 23,012 39,457 43,139 45,054 49,522 73,701
metro.mhv.pct.change -99 -5 0 0 6 99
pay.change 4,930 9,775 11,441 12,130 14,001 26,211
race.change -100 -12 -5 -8 -2 100
metro.race.rank.90 1 20 41 45 68 100

Operationalizing Gentrification

# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?


# home value in lower than average home in a metro in 2000
poor.1990 <- d3$metro.mhv.pct.90 < 50  

# above average diversity for metro
diverse.1990 <- d3$metro.race.rank.90 > 50 

# home values increased more than overall city gains 
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0

# faster than average growth  
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25 

# proportion of whites increases by more than 3 percent 
# measured by increase in white
loss.diversity <- d3$race.change > 3 

g.flag <- poor.1990 & diverse.1990 & mhv.pct.increase & home.val.rise & loss.diversity

num.candidates <-  sum( poor.1990 & diverse.1990, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )
num.gentrified 
## [1] 371
num.candidates
## [1] 17560
num.gentrified / num.candidates
## [1] 0.02112756

By the above definition, based on the analysis only 2.1 percent of urban tracts experience gentrification between 1990 and 2000 as compared to 5.7 percent from 2000 to 2010.

17560 of census tracts are candidates, and 371 transitioned into advanced stage of gentrification.

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

mhv.90[ mhv.90 < 1000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.90 )
summary( pct.change )
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -100.0000  -15.9418    0.4362    5.0194   20.2215 2832.2571       145

Part III

The dorling cartogram shows the census data and groups from clustering.

# DATA STEPS 

# load dorling cartogram from github
# map already contains census data and groups from clustering 

github.url <- "https://raw.githubusercontent.com/WSKQ23/william-project/main/pittt_dorling.geojson"
pit <- geojson_read( x=github.url,  what="sp" )

# make sure they have not changed or 
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( pit )
## [1] 627
plot(pit)

# create small dataframe for the merge
df <- data.frame(  tractid=d$tractid, 
        mhv.90,  mhv.00,  mhv.change,  pct.change  )

# create GEOID that matches GIS format

# create a geoID for merging by tract 
df$GEOID <- substr( df$tractid, 6, 18 )  # extract codes
df$GEOID <- gsub( "-", "", df$GEOID )    # remove hyphens
class( df$GEOID )
## [1] "character"
head( pit@data )
pit <- merge( pit, df, by.x="GEOID", by.y="GEOID" )

# make sure they have not changed or 
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( pit ) 
## [1] 627
pit <- spTransform( pit, CRS("+init=epsg:3395") )

bb <- st_bbox( c( xmin =  -9100000, xmax = -8700000, 
                  ymax = 4800000, ymin = 5000000 ), 
               crs = st_crs("+init=epsg:3395")) 
## Warning in CPL_crs_from_input(x): GDAL Message 1: +init=epsg:XXXX syntax is
## deprecated. It might return a CRS with a non-EPSG compliant axis order.
tm_shape( pit, bbox=bb ) + 
  tm_polygons( col="mhv.90", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

The cartogram above shows candidates for gentrification in 1990 disaggregated by mean home value. The map shows a higher median home value in the North-West region of the city and lower median home values toward the center.

From 2000:

pit <- spTransform( pit, CRS("+init=epsg:3395") )
## Warning: PROJ support is provided by the sf and terra packages among others
bb <- st_bbox( c( xmin =  -9100000, xmax = -8700000, 
                  ymax = 4800000, ymin = 5000000 ), 
               crs = st_crs("+init=epsg:3395")) 

tm_shape( pit, bbox=bb ) + 
  tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

The cartogram above shows candidates for gentrification in 2000 disaggregated by mean home value. The map shows a higher median home value in the North and South-West region of the city and lower median home values toward the center.

tm_shape( pit, bbox=bb ) + 
  tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

The cartogram above shows the distribution of gentrification across the city between 1990 and 2000. The map shows that change in the median home value happens more toward the north and north-west areas of Pittsburgh.

tm_shape( pit, bbox=bb ) + 
  tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

The cartogram above shows that the percent change in median home value as a result of the distribution of gentrification happens in the far northern area of Pittsburgh.