This tutorial contains the steps the team took to create a data subset of the LTDB files and conduct descriptive analysis of that subset.

library( here )
library( data.table )
library( dplyr )
library( pander )
library( tidyverse )
library( readr )
library( forecast )
library( knitr )
library( stargazer )
library( scales )

library( geojsonio )   
library( sp )          
library( sf )          
library( mclust )       
library( tmap )        
library( ggplot2 )      
library( ggthemes )    
library( dplyr )       
library( pander )      
library( tidycensus )
library( cartogram )  
library( maptools )
library( corrplot )

s.type <- "text"

Data set use for analysis

In order to determine changes in home value differ between 1990 and 2000, a 1990 to 2000 (ten years) data sets were used.

d1 <- readRDS( here::here( "data/rodeo/LTDB-1990.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )

# check to make sure you are not losing or gaining observations in the merge

nrow( d1 )

## [1] 72693

d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )

nrow( d )

## [1] 72693

Filter Urban Districts

table( d$urban )

## 
## rural urban 
## 12971 59722

d <- filter( d, urban == "urban" )

Identify Common Variables

To enhance proper analysis of the data sets, common variables need to be identified for 1990 and 2000 data sets:

# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
  # use regular expressions to remove numeric suffixes 
  var.names.1 <- names( df1 )
  var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
  var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
  
  var.names.2 <- names( df2 )
  var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
  var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
  
  shared <- intersect( var.names.1, var.names.2 ) %>% sort()
  print( "SHARED VARIABLES:")
  print( shared )
  
  not.shared <- c( setdiff( var.names.1, var.names.2 ),
                   setdiff( var.names.2, var.names.1 ) ) %>% sort()
  
  print( "NOT SHARED:" )
  print( not.shared )
  
  d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
  d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
  dd <- rbind( d.vars1, d.vars2 )
  
  return( dd )
}

vars <- compare_dfs( df1=d1, df2=d2 )

## [1] "SHARED VARIABLES:"
##   [1] "a15asn"  "a15blk"  "a15hsp"  "a15ntv"  "a15wht"  "a18und"  "a60asn" 
##   [8] "a60blk"  "a60hsp"  "a60ntv"  "a60up"   "a60wht"  "a75up"   "ag15up" 
##  [15] "ag25up"  "ag5up"   "ageasn"  "ageblk"  "agehsp"  "agentv"  "agewht" 
##  [22] "asian"   "china"   "clf"     "cni16u"  "col"     "cuban"   "dapov"  
##  [29] "dbpov"   "dflabf"  "dfmpov"  "dhpov"   "dis"     "dmulti"  "dnapov" 
##  [36] "dpov"    "dwpov"   "empclf"  "family"  "fb"      "fhh"     "filip"  
##  [43] "flabf"   "geanc"   "gefb"    "h10yrs"  "h30old"  "haw"     "hh"     
##  [50] "hha"     "hhb"     "hhh"     "hhw"     "hinc"    "hinca"   "hincb"  
##  [57] "hinch"   "hincw"   "hisp"    "hs"      "hu"      "incpc"   "india"  
##  [64] "iranc"   "irfb"    "itanc"   "itfb"    "japan"   "korea"   "lep"    
##  [71] "manuf"   "mar"     "mex"     "mhmval"  "mrent"   "multi"   "n10imm" 
##  [78] "n65pov"  "napov"   "nat"     "nbpov"   "nfmpov"  "nhblk"   "nhpov"  
##  [85] "nhwht"   "nnapov"  "npov"    "ntv"     "nwpov"   "ohu"     "olang"  
##  [92] "own"     "pop"     "pr"      "prof"    "rent"    "ruanc"   "rufb"   
##  [99] "scanc"   "scfb"    "semp"    "tractid" "unemp"   "vac"     "vet"    
## [106] "viet"    "wds"    
## [1] "NOT SHARED:"
## [1] "ag16cv"  "ag18cv"  "hu00sp"  "hu90sp"  "ohu00sp" "ohu90sp" "pop90.1"

d.full <- d  # keep a copy so you don't have to reload

d <- d.full  # story original in case you need to reset anything

d <- select( d, tractid, mhmval90, mhmval00, hinc90, 
             hu90, own90, rent90,  
             empclf90, clf90, unemp90, prof90,  
             dpov90, npov90,
             ag25up90, hs90, col90, 
             pop90.x, nhwht90, nhblk90, hisp90, asian90,
             cbsa, cbsaname )
d <- 
  d %>%
  mutate( p.white = 100 * nhwht90 / pop90.x,
          p.black = 100 * nhblk90 / pop90.x,
          p.hisp = 100 * hisp90 / pop90.x, 
          p.asian = 100 * asian90 / pop90.x,
          p.hs = 100 * (hs90+col90) / ag25up90,
          p.col = 100 * col90 / ag25up90,
          p.prof = 100 * prof90 / empclf90,
          p.unemp = 100 * unemp90 / clf90,
          pov.rate = 100 * npov90 / dpov90 )

Table of descriptive statistics

The descriptive statistic below shows the summary statistics of all the variables used for analysis.

stargazer( d, 
           type="html", 
           digits=0,
           summary.stat = c("min", "p25","median","mean","p75","max") )


Statistic	Min	Pctl(25)	Median	Mean	Pctl(75)	Max

mhmval90	0	58,800	86,500	112,399	141,800	500,001
mhmval00	0	81,600	119,900	144,738	173,894	1,000,001
hinc90	4,999	24,355	32,033	34,160	41,520	150,001
hu90	0	916	1,337	1,382	1,804	11,003
own90	0	422	744	788	1,093	8,180
rent90	0	168	358	476	663	8,653
empclf90	0	1,015	1,555	1,625	2,157	11,816
clf90	0	1,101	1,664	1,732	2,291	12,497
unemp90	0	49	88	107	143	1,165
prof90	0	192	359	451	616	6,290
dpov90	0	2,219	3,250	3,332	4,365	23,619
npov90	0	112	248	406	526	5,890
ag25up90	0	1,444	2,121	2,187	2,865	18,461
hs90	0	626	1,064	1,141	1,559	12,383
col90	0	162	335	483	652	8,575
pop90.x	0	2,283	3,325	3,419	4,464	35,721
nhwht90	0	1,262	2,424	2,513	3,575	21,561
nhblk90	0	18	76	426	331	12,121
hisp90	0	27	75	347	280	13,873
asian90	0	9	31	111	94	7,899
p.white	0	64	87	74	95	100
p.black	0	1	3	12	10	100
p.hisp	0	1	3	10	9	100
p.asian	0	0	1	3	3	94
p.hs	0	69	74	74	80	100
p.col	0	10	18	22	30	100
p.prof	0	17	25	27	34	100
p.unemp	0	4	5	7	8	64
pov.rate	0	4	8	12	16	100

Exploration of Median Home Value

Initial conditions for data in 1990 and the home value for inflation.

# adjust 2000 home values for inflation 
mhv.90 <- d$mhmval90 * 1.32  
mhv.00 <- d$mhmval00

mhv.change <- mhv.00 - mhv.90

df <- data.frame( MedianHomeValue1990=mhv.90, 
                  MedianHomeValue2000=mhv.00, 
                  Change.90.to.00=mhv.change )

stargazer( df, 
           type="html", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )


Statistic	Min	Pctl(25)	Median	Mean	Pctl(75)	Max

MedianHomeValue1990	0	77,616	114,180	148,367	187,176	660,001
MedianHomeValue2000	0	81,600	119,900	144,738	173,894	1,000,001
Change.90.to.00	-660,001	-23,339	453	-3,629	19,100	1,000,001

Histogram of MHV

This histogram shows the distribution of home value changes between 1990 and 2000:

hist( mhv.change/1000, breaks=500, 
      xlim=c(-100,500), yaxt="n", xaxt="n",
      xlab="Thousand of US Dollars (adjusted to 2000)", cex.lab=1.5,
      ylab="", main="Change in Median Home Value 1990 to 2000",
      col="gray20", border="white" )

axis( side=1, at=seq( from=-100, to=500, by=100 ), 
      labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
        
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500, 
      labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ), 
      col="darkorange", cex=1.8, pos=3 )

median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000, 
      labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ), 
      col="dodgerblue", cex=1.8, pos=3 )

# function to control plot() formatting 

jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{

    plot( x1, x2,
          pch=19, 
          col=gray(0.6, alpha = 0.2), 
          cex=2.5,  
          bty = "n",
          xlab=lab1, 
          ylab=lab2, cex.lab=1.5,
        ... )

    if( draw.line==T ){ 
        ok <- is.finite(x1) & is.finite(x2)
        lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }

}

The distributions below compares the distribution of the data raw data between 1990 and 2000:

layout.matrix <- matrix( c( 1,3,
                            2,3 ), 
                nrow=2, ncol=2, byrow=T )

layout( mat = layout.matrix,
        heights = c(2,2), # Heights of the two rows
        widths =  c(3,4)) # Widths of the two columns

# layout.show(3)

par( mar=c(4,0,0,2) )

hist( mhv.90/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )

abline( v=seq(0,1000,100), lty=2, col="gray80" )

text( 550, 4000, labels="Median Home \nValue in 1990", 
      col="darkslateblue", cex=1.8 )



hist( mhv.00/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )

text( 550, 3500, labels="Median Home \nValue in 2000", 
      col="darkslateblue", cex=1.8 )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )


# data reduction - filter 1,000 observations

df <- data.frame( v90=mhv.90/1000, v00=mhv.00/1000 )
df <- sample_n( df, 1000 )

par( mar=c(4,5,3,2) )

jplot( df$v90, df$v00, 
       lab1="MHV in 1990", lab2="MHV in 2000",
       xlim=c(0,1000), ylim=c(0,1000),
       axes=F )

abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )

Change in MHV 1990-2000

The small initial values in the data are skewing the percentages. Assuming and average home value below /$10K is a vacant lot, these data can be removed. Outliers with over 500% change should also be removed to limit data skew.

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance 
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic


mhv.90[ mhv.90 < 10000 ] <- NA
pct.change <- mhv.change / mhv.90
summary( pct.change )

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -1.00000 -0.15942  0.00436  0.05019  0.20222 28.32257      145

How many entries show a >500% increase:

# how many cases had increases above 500%
sum( pct.change > 5, na.rm=T )

## [1] 26

Preview of the data with large increase (>500%) to see if this makes sense.

# preview tracts with large increases in home values 
# to see if increases make sense 

d %>% 
  filter( pct.change > 5 ) %>% 
  head()

Plot the percent change variable:

With the outliars removed to minimize skew, we can now accurately represent the percent change in home value between 1990 and 2000.

hg <-
hist( pct.change, breaks=2000, 
      xlim=c(-1,2), yaxt="n", xaxt="n",
      xlab="", cex.main=1.5,
      ylab="", main="Growth in Home Value by Census Tract 1990 to 2000",
      col="gray40", border="white" )

axis( side=1, at=seq( from=-1, to=2, by=0.5 ), 
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )

ymax <- max( hg$count )
        
mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax), 
      labels=paste0( "Mean = ", round(100*mean.x,0), "%"), 
      col="darkorange", cex=1.8, pos=4 )

median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax), 
      labels=paste0( "Median = ", round(100*median.x,0), "%"), 
      col="dodgerblue", cex=1.8, pos=4 )

Group Growth Rates By Metro Area

We often want to disagregate descriptive by some grouping in the data, such as metro areas. dplyr makes this easy by grouping then summarizing the data.

d$mhv.change <- mhv.change 
d$pct.change <- pct.change
d$mhv.00 <- mhv.00
d$mhv.90 <- mhv.90

d %>%
  group_by( cbsaname ) %>%
  summarize( ave.change = median( mhv.change, na.rm=T ),
             ave.change.d = dollar( round(ave.change,0) ),
             growth = 100 * median( pct.change, na.rm=T ) ) %>%
  ungroup() %>%
  arrange( - growth ) %>%
  select( - ave.change ) %>% 
  head( 25 ) %>%
  pander()

cbsaname	ave.change.d	growth
Corvallis, OR	$73,403	75.42
Portland-Vancouver-Beaverton, OR-WA	$67,900	71.8
Salt Lake City, UT	$61,005	69.02
Boulder, CO	$94,414	68.56
Provo-Orem, UT	$60,131	67.84
Salem, OR	$48,726	61.27
Eugene-Springfield, OR	$51,072	61.11
Fort Collins-Loveland, CO	$57,232	56.31
Longview, WA	$40,726	55.06
Missoula, MT	$45,520	54.35
Jackson, MI	$28,210	53.41
Greeley, CO	$41,416	51.69
Wenatchee, WA	$45,676	50.45
Detroit-Livonia-Dearborn, MI	$28,733	49.37
Yakima, WA	$30,828	49.34
Ogden-Clearfield, UT	$42,194	48.51
Denver-Aurora, CO	$52,010	47.94
Monroe, MI	$42,004	47.38
Bay City, MI	$26,654	47.04
Logan, UT-ID	$35,543	46.43
Eau Claire, WI	$27,792	43.02
Madison, WI	$42,897	42.14
Sioux Falls, SD	$31,025	41.11
Grand Junction, CO	$34,334	40.9
Mount Vernon-Anacortes, WA a	$41,944	40.61

Comparing the median home value between 1990-2000 and 2000-2010. The above analysis shows that 1990-2000 has a median change of $453. Reviewing the tutorial analysis, 2000-2010 has a median change of $36,268. Change in home values have increased by over $35,000 in just ten years!

The average percent change in growth in home value between 1990 and 2000 is only 5% where the average percent change in growth in home value between 2000 and 2010 is 33%!

Part II

Measuring Gentrification

The original merged dataset we saved as d.full, so we do not need to reload it:

Recall our data steps thus far:

# adjust 2000 home values for inflation 
mhv.1990 <- d.full$mhmval90 * 1.32  
mhv.00 <- d.full$mhmval00

mhv.change <- mhv.00 - mhv.90

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

mhv.90[ mhv.90 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.90)
summary( pct.change ) %>% pander()

Min.	1st Qu.	Median	Mean	3rd Qu.	Max.	NA’s
-100	-15.94	0.4362	5.019	20.22	2832	145

d.full$mhv.90 <- mhv.90
d.full$mhv.00 <- mhv.00
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.change

Selection of Gentrification Variables

The selection of variables for operationalizing and defining gentrification: We added some variables from 2000:

head( vars )

d3 <- select( d.full, 
             
             tractid, cbsa, cbsaname,            # ids / units of analysis
             
             mhv.90, mhv.00, mhv.change, pct.change,    # home value 
             
             hinc90, hu90, own90, rent90,        # ses
             hinc00, hu00, own00, rent00,
             
             empclf90, clf90, unemp90, prof90,   # employment 
             empclf00, clf00, unemp00, prof00,
             
             dpov90, npov90,                     # poverty
             dpov00, npov00,
             
             ag25up90, hs90, col90,              # education 
             ag25up00, hs00, col00,
             
             pop90.x, nhwht90, nhblk90, hisp90, asian90,   # race
             pop00.x, nhwht00, nhblk00, hisp00, asian00
             
          ) # end select

d3 <- 
  d3 %>%
  mutate( 
          # 1990 variables
          p.white.90 = 100 * nhwht90 / pop90.x,
          p.black.90 = 100 * nhblk90 / pop90.x,
          p.hisp.90 = 100 * hisp90 / pop90.x, 
          p.asian.90 = 100 * asian90 / pop90.x,
          p.hs.edu.90 = 100 * (hs90+col90) / ag25up90,
          p.col.edu.90 = 100 * col90 / ag25up90,
          p.prof.90 = 100 * prof90 / empclf90,
          p.unemp.90 = 100 * unemp90 / clf90,
          pov.rate.90 = 100 * npov90 / dpov90,
          
          # 2000 variables
          p.white.00 = 100 * nhwht00 / pop00.x,
          p.black.00 = 100 * nhblk00 / pop00.x,
          p.hisp.00 = 100 * hisp00 / pop00.x, 
          p.asian.00 = 100 * asian00 / pop00.x,
          p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
          p.col.edu.00 = 100 * col00 / ag25up00,
          p.prof.00 = 100 * prof00 / empclf00,
          p.unemp.00 = 100 * unemp00 / clf00,
          pov.rate.00 = 100 * npov00 / dpov00 )

d3 <-
  d3 %>%
  group_by( cbsaname ) %>%
  mutate( metro.mhv.pct.90 = ntile( mhv.90, 100 ),
          metro.mhv.pct.00 = ntile( mhv.00, 100 ),
          metro.median.pay.90 = median( hinc90, na.rm=T ),
          metro.median.pay.00 = median( hinc00, na.rm=T ),
          metro.race.rank.90 = ntile( (100-p.white.90), 100 ) ) %>%
  ungroup() %>%
  mutate( metro.mhv.pct.change = metro.mhv.pct.00 - metro.mhv.pct.90,
          pay.change = metro.median.pay.00 - metro.median.pay.90,
          race.change = p.white.00 - p.white.90,
          mhv.change = mhv.00 - mhv.90 )

Descriptive Statistics of Change Variables

The table below describes the summary statistics for the selected change variables.

d3 <-           
  d3 %>%
  select( c( "tractid", "cbsa", "cbsaname",
             "mhv.90", "mhv.00", "mhv.change","pct.change",
          "p.white.90", "p.black.90", "p.hisp.90", "p.asian.90", 
          "p.hs.edu.90", "p.col.edu.90", "p.prof.90",  "p.unemp.90", 
          "pov.rate.90", "p.white.00", "p.black.00", "p.hisp.00", 
          "p.asian.00", "p.hs.edu.00", "p.col.edu.00", "p.prof.00", 
          "p.unemp.00", "pov.rate.00", "metro.mhv.pct.90", 
          "metro.mhv.pct.00", "metro.median.pay.90", "metro.median.pay.00", 
          "metro.mhv.pct.change", "pay.change", "race.change",
          "metro.race.rank.90") ) 
  
# head( d3 ) %>% pander()

d3 <- data.frame(d3)
stargazer( d3, 
           type="html", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )


Statistic	Min	Pctl(25)	Median	Mean	Pctl(75)	Max

mhv.90	11,712	77,880	114,312	148,728	187,440	660,001
mhv.00	0	81,600	119,900	144,738	173,894	1,000,001
mhv.change	-660,001	-23,448	396	-4,122	18,968	963,701
pct.change	-100	-16	0	5	20	2,832
p.white.90	0	64	87	74	95	100
p.black.90	0	1	3	12	10	100
p.hisp.90	0	1	3	10	9	100
p.asian.90	0	0	1	3	3	94
p.hs.edu.90	0	69	74	74	80	100
p.col.edu.90	0	10	18	22	30	100
p.prof.90	0	17	25	27	34	100
p.unemp.90	0	4	5	7	8	64
pov.rate.90	0	4	8	12	16	100
p.white.00	0	47	78	67	91	100
p.black.00	0	1	4	14	14	100
p.hisp.00	0	2	4	13	15	100
p.asian.00	0	1	2	5	5	95
p.hs.edu.00	0	67	72	72	77	100
p.col.edu.00	0	12	21	26	36	100
p.prof.00	0	23	31	34	43	100
p.unemp.00	0	3	5	6	8	100
pov.rate.00	0	4	9	12	17	100
metro.mhv.pct.90	1	20	41	45	68	100
metro.mhv.pct.00	1	20	41	45	68	100
metro.median.pay.90	14,871	28,906	32,457	32,924	35,833	52,374
metro.median.pay.00	23,012	39,457	43,139	45,054	49,522	73,701
metro.mhv.pct.change	-99	-5	0	0	6	99
pay.change	4,930	9,775	11,441	12,130	14,001	26,211
race.change	-100	-12	-5	-8	-2	100
metro.race.rank.90	1	20	41	45	68	100

Operationalizing Gentrification

# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?


# home value in lower than average home in a metro in 2000
poor.1990 <- d3$metro.mhv.pct.90 < 50  

# above average diversity for metro
diverse.1990 <- d3$metro.race.rank.90 > 50 

# home values increased more than overall city gains 
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0

# faster than average growth  
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25 

# proportion of whites increases by more than 3 percent 
# measured by increase in white
loss.diversity <- d3$race.change > 3 

g.flag <- poor.1990 & diverse.1990 & mhv.pct.increase & home.val.rise & loss.diversity

num.candidates <-  sum( poor.1990 & diverse.1990, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )

num.gentrified

## [1] 371

num.candidates

## [1] 17560

num.gentrified / num.candidates

## [1] 0.02112756

By the above definition, based on the analysis only 2.1 percent of urban tracts experience gentrification between 1990 and 2000 as compared to 5.7 percent from 2000 to 2010.

17560 of census tracts are candidates, and 371 transitioned into advanced stage of gentrification.

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

mhv.90[ mhv.90 < 1000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.90 )
summary( pct.change )

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -100.0000  -15.9418    0.4362    5.0194   20.2215 2832.2571       145

Part III

The dorling cartogram shows the census data and groups from clustering.

# DATA STEPS 

# load dorling cartogram from github
# map already contains census data and groups from clustering 

github.url <- "https://raw.githubusercontent.com/WSKQ23/william-project/main/pittt_dorling.geojson"
pit <- geojson_read( x=github.url,  what="sp" )

# make sure they have not changed or 
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( pit )

## [1] 627

plot(pit)

# create small dataframe for the merge
df <- data.frame(  tractid=d$tractid, 
        mhv.90,  mhv.00,  mhv.change,  pct.change  )

# create GEOID that matches GIS format

# create a geoID for merging by tract 
df$GEOID <- substr( df$tractid, 6, 18 )  # extract codes
df$GEOID <- gsub( "-", "", df$GEOID )    # remove hyphens
class( df$GEOID )

## [1] "character"

head( pit@data )

pit <- merge( pit, df, by.x="GEOID", by.y="GEOID" )

# make sure they have not changed or 
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( pit )

## [1] 627

pit <- spTransform( pit, CRS("+init=epsg:3395") )

bb <- st_bbox( c( xmin =  -9100000, xmax = -8700000, 
                  ymax = 4800000, ymin = 5000000 ), 
               crs = st_crs("+init=epsg:3395"))

## Warning in CPL_crs_from_input(x): GDAL Message 1: +init=epsg:XXXX syntax is
## deprecated. It might return a CRS with a non-EPSG compliant axis order.

tm_shape( pit, bbox=bb ) + 
  tm_polygons( col="mhv.90", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

The cartogram above shows candidates for gentrification in 1990 disaggregated by mean home value. The map shows a higher median home value in the North-West region of the city and lower median home values toward the center.

From 2000:

pit <- spTransform( pit, CRS("+init=epsg:3395") )

## Warning: PROJ support is provided by the sf and terra packages among others

bb <- st_bbox( c( xmin =  -9100000, xmax = -8700000, 
                  ymax = 4800000, ymin = 5000000 ), 
               crs = st_crs("+init=epsg:3395")) 

tm_shape( pit, bbox=bb ) + 
  tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

The cartogram above shows candidates for gentrification in 2000 disaggregated by mean home value. The map shows a higher median home value in the North and South-West region of the city and lower median home values toward the center.

tm_shape( pit, bbox=bb ) + 
  tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

The cartogram above shows the distribution of gentrification across the city between 1990 and 2000. The map shows that change in the median home value happens more toward the north and north-west areas of Pittsburgh.

tm_shape( pit, bbox=bb ) + 
  tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral", midpoint = NA ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

The cartogram above shows that the percent change in median home value as a result of the distribution of gentrification happens in the far northern area of Pittsburgh.

Descriptive Analysis

Team 03