Using tidycensus, load in variables on median household income and median housing value from the ACS 5-year estimates (acs5), 2013-2017.
Use scale_fill_viridis to create natural breaks for color sequencing and to map the House-Price-to-Income Ratio.
R has a variety of functions that allow the user to create color scales easily. These functions generally require you to specify a color on each end of the spectrum, and they will interpolate the values in between based upon how many levels you desire.
We select colors to communicate information about our data. If we are using a continuous variable the most basic decision is whether we want to represent it as positive and negative deviations from the average (a divergent scale), or as a continuum of low to high values (a sequential scale). If we have categorical data, it is generally visualized through different colors representing each group (a qualitative scale).
# Set up the canvas; axes, bounding box, and axis labels are all suppressed.
plot(
  1:7, rep(1, 7),
  ylim = c(-0.5, 3.5), xlim = c(0, 12),
  yaxt = "n", xaxt = "n", bty = "n",
  xlab = "", ylab = ""
)

# Top row: two-color ramp from light gray to dark red.
color.function <- colorRampPalette(c("gray80", "darkred"))
col.ramp <- color.function(7) # seven swatches per row
points(1:7, rep(3, 7), pch = 15, cex = 8, col = col.ramp)

# Middle row: three-color ramp with light gray as the midpoint.
color.function <- colorRampPalette(c("darkred", "gray80", "steelblue"))
col.ramp <- color.function(7)
points(1:7, rep(2, 7), pch = 15, cex = 8, col = col.ramp)

# Bottom row: light gray to black.
color.function <- colorRampPalette(c("gray80", "black"))
col.ramp <- color.function(7)
points(1:7, rep(1, 7), pch = 15, cex = 8, col = col.ramp)

# Row labels, placed to the right of the swatches (pos = 4).
text(
  rep(8, 3), 3:1,
  c("Sequential", "Divergent", "Grayscale"),
  pos = 4
)
Convert HHInc_HousePrice_Ratio from a continuous variable to a factor variable with 2 levels:
## Convert the continuous ratio into a factor variable
## Bin the ratio using the quantile breaks 0, 0.5, 1
CenDF$fill_factor <- quantcut(
  CenDF$HHInc_HousePrice_Ratio,
  q = c(0, 0.5, 1)
)

## Swap the interval labels for readable names
CenDF$fill_factor <- mapvalues(
  CenDF$fill_factor,
  from = levels(CenDF$fill_factor),
  to = c("low", "high")
)

## Assign 2 colors
col.ramp <- viridis(n = 2) # one color per group
Convert HHInc_HousePrice_Ratio from a continuous variable to a factor variable with 3 levels:
## Convert the continuous ratio into a factor variable
## Bin the ratio into three quantile groups (tertiles).
## The breaks must run exactly from 0 to 1. seq(0, 1, by = .33) stops at
## 0.99, which shifts the cut points and leaves every value above the 99th
## percentile outside the bins (NA), so length.out is used instead.
CenDF$fill_factor <- quantcut(
  CenDF$HHInc_HousePrice_Ratio,
  q = seq(0, 1, length.out = 4)
)

## Swap the interval labels for readable names
CenDF$fill_factor <- mapvalues(
  CenDF$fill_factor,
  from = levels(CenDF$fill_factor),
  to = c("low", "med", "high")
)

## Assign 3 colors
col.ramp <- viridis(n = 3) # one color per group
Convert HHInc_HousePrice_Ratio from a continuous variable to a factor variable with 5 levels:
## Convert the continuous ratio into a factor variable
## Bin the ratio using quantile breaks every 0.2 (quintiles)
CenDF$fill_factor <- quantcut(
  CenDF$HHInc_HousePrice_Ratio,
  q = seq(0, 1, by = 0.2)
)

## Relabel the bins "1" through "5"
CenDF$fill_factor <- mapvalues(
  CenDF$fill_factor,
  from = levels(CenDF$fill_factor),
  to = as.character(1:5)
)

## Assign 5 colors
col.ramp <- viridis(n = 5) # one color per group
Convert HHInc_HousePrice_Ratio from a continuous variable to a factor variable with 6 levels:
## Convert the continuous ratio into a factor variable
## Bin the ratio at unevenly spaced quantile breaks (finer bins in the tails)
CenDF$fill_factor <- quantcut(
  CenDF$HHInc_HousePrice_Ratio,
  q = c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1)
)

## Optional relabelling, left disabled so the interval labels are kept:
# CenDF$fill_factor <- mapvalues(
#   CenDF$fill_factor,
#   from = levels(CenDF$fill_factor),
#   to = c("< .1", ".1-.25", ".25-.5", ".5-.75", ".75-.9", "> .9")
# )

## Assign 6 colors
col.ramp <- viridis(n = 6) # one color per group
# Choropleth of the raw ratio on a reversed viridis scale.
ggplot(CenDF) +
  geom_sf(
    aes(fill = HHInc_HousePrice_Ratio),
    color = NA # no polygon outlines
  ) +
  coord_sf(datum = NA) + # no graticule
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  ) +
  scale_fill_viridis(direction = -1)
Our price ratio is skewed because of the long right tail:
As a result, most of our data will be crammed into the bottom half of our scale, making the colors a lot harder to interpret.
We can consider ways to transform the scale before visualization.
Log-transforming the values can pull in outliers:
We can also top-code scales to remove the long tail:
# Top-coding: cap every ratio above 5 at 5, then inspect the distribution.
CenDF$inc.home.ratio <- pmin(CenDF$HHInc_HousePrice_Ratio, 5)
hist(
  CenDF$inc.home.ratio,
  breaks = 50,
  col = "darkgray"
)
Let’s also select a divergent scale to better differentiate high and low values (a neutral color like gray represents average values).
The pals package has some crisp color scales.
### To install: devtools::install_github("kwstat/pals")
library(pals)

# Preview the ocean.balance palette with 13 colors.
pal.map(
  ocean.balance,
  n = 13,
  main = "Example Divergent Colorscale"
)
Let’s see how this changes our interpretation of the data:
## REGULAR DIVERGENT SCALE WHEN DATA IS SKEWED
## The untransformed ratio is mapped onto a 10-color ocean.balance gradient.
ggplot(CenDF) +
  geom_sf(
    aes(fill = HHInc_HousePrice_Ratio),
    color = NA # no polygon outlines
  ) +
  coord_sf(datum = NA) + # no graticule
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  ) +
  scale_fill_gradientn(
    colours = ocean.balance(10),
    guide = "colourbar"
  )
## TOP-CODED SCALE - MAX IS RATIO OF 6
## Ratios above 6 are capped at 6 before mapping.
CenDF$inc.home.ratio <- pmin(CenDF$HHInc_HousePrice_Ratio, 6)

ggplot(CenDF) +
  geom_sf(
    aes(fill = inc.home.ratio),
    color = NA # no polygon outlines
  ) +
  coord_sf(datum = NA) + # no graticule
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  ) +
  scale_fill_gradientn(
    colours = ocean.balance(10),
    guide = "colourbar"
  )
## LOGGED RATIO
## 1 is added before taking the natural log.
CenDF$log.price.ratio <- log(CenDF$HHInc_HousePrice_Ratio + 1)

ggplot(CenDF) +
  geom_sf(
    aes(fill = log.price.ratio),
    color = NA # no polygon outlines
  ) +
  coord_sf(datum = NA) + # no graticule
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio (log)"
  ) +
  scale_fill_gradientn(
    colours = ocean.balance(10),
    guide = "colourbar"
  )