class: center, middle, inverse, title-slide # Recoding data ##
College of the Atlantic --- class: middle # Case study: Religion and income --- <img src="img/relig-income.png" width="75%" style="display: block; margin: auto;" /> .footnote[ Source: [pewforum.org/religious-landscape-study/income-distribution](https://www.pewforum.org/religious-landscape-study/income-distribution/), Retrieved 14 April, 2020 ] --- ## Read data ```r library(readxl) rel_inc <- read_excel("data/relig-income.xlsx") ``` .small[ ``` ## # A tibble: 12 x 6 ## `Religious tradition` Less ~1 $30,0~2 $50,0~3 $100,~4 Sampl~5 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 Buddhist 0.36 0.18 0.32 0.13 233 ## 2 Catholic 0.36 0.19 0.26 0.19 6137 ## 3 Evangelical Protestant 0.35 0.22 0.28 0.14 7462 ## 4 Hindu 0.17 0.13 0.34 0.36 172 ## 5 Historically Black Pro~ 0.53 0.22 0.17 0.08 1704 ## 6 Jehovah's Witness 0.48 0.25 0.22 0.04 208 ## # ... with 6 more rows, and abbreviated variable names ## # 1: `Less than $30,000`, 2: `$30,000-$49,999`, ## # 3: `$50,000-$99,999`, 4: `$100,000 or more`, ## # 5: `Sample Size` ``` ] --- ## Rename columns .midi[ ```r rel_inc %>% rename( religion = `Religious tradition`, n = `Sample Size` ) ``` ``` ## # A tibble: 12 x 6 ## religion Less ~1 $30,0~2 $50,0~3 $100,~4 n ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 Buddhist 0.36 0.18 0.32 0.13 233 ## 2 Catholic 0.36 0.19 0.26 0.19 6137 ## 3 Evangelical Protestant 0.35 0.22 0.28 0.14 7462 ## 4 Hindu 0.17 0.13 0.34 0.36 172 ## 5 Historically Black Prote~ 0.53 0.22 0.17 0.08 1704 ## 6 Jehovah's Witness 0.48 0.25 0.22 0.04 208 ## # ... with 6 more rows, and abbreviated variable names ## # 1: `Less than $30,000`, 2: `$30,000-$49,999`, ## # 3: `$50,000-$99,999`, 4: `$100,000 or more` ``` ] --- .question[ If we want a new variable called `income` with levels such as "Less than $30,000", "$30,000-$49,999", ... etc. which function should we use? 
] ``` ## # A tibble: 48 x 4 ## religion n income proportion ## <chr> <dbl> <chr> <dbl> ## 1 Buddhist 233 Less than $30,000 0.36 ## 2 Buddhist 233 $30,000-$49,999 0.18 ## 3 Buddhist 233 $50,000-$99,999 0.32 ## 4 Buddhist 233 $100,000 or more 0.13 ## 5 Catholic 6137 Less than $30,000 0.36 ## 6 Catholic 6137 $30,000-$49,999 0.19 ## 7 Catholic 6137 $50,000-$99,999 0.26 ## 8 Catholic 6137 $100,000 or more 0.19 ## 9 Evangelical Protestant 7462 Less than $30,000 0.35 ## 10 Evangelical Protestant 7462 $30,000-$49,999 0.22 ## 11 Evangelical Protestant 7462 $50,000-$99,999 0.28 ## 12 Evangelical Protestant 7462 $100,000 or more 0.14 ## 13 Hindu 172 Less than $30,000 0.17 ## 14 Hindu 172 $30,000-$49,999 0.13 ## 15 Hindu 172 $50,000-$99,999 0.34 ## # ... with 33 more rows ``` --- ## Pivot longer .midi[ ```r rel_inc %>% rename( religion = `Religious tradition`, n = `Sample Size` ) %>% pivot_longer( cols = -c(religion, n), # all but religion and n names_to = "income", values_to = "proportion" ) ``` ``` ## # A tibble: 48 x 4 ## religion n income proportion ## <chr> <dbl> <chr> <dbl> ## 1 Buddhist 233 Less than $30,000 0.36 ## 2 Buddhist 233 $30,000-$49,999 0.18 ## 3 Buddhist 233 $50,000-$99,999 0.32 ## 4 Buddhist 233 $100,000 or more 0.13 ## 5 Catholic 6137 Less than $30,000 0.36 ## 6 Catholic 6137 $30,000-$49,999 0.19 ## # ... with 42 more rows ``` ] --- ## Calculate frequencies .midi[ ```r rel_inc %>% rename( religion = `Religious tradition`, n = `Sample Size` ) %>% pivot_longer( cols = -c(religion, n), names_to = "income", values_to = "proportion" ) %>% mutate(frequency = round(proportion * n)) ``` ``` ## # A tibble: 48 x 5 ## religion n income proportion frequency ## <chr> <dbl> <chr> <dbl> <dbl> ## 1 Buddhist 233 Less than $30,000 0.36 84 ## 2 Buddhist 233 $30,000-$49,999 0.18 42 ## 3 Buddhist 233 $50,000-$99,999 0.32 75 ## 4 Buddhist 233 $100,000 or more 0.13 30 ## 5 Catholic 6137 Less than $30,000 0.36 2209 ## 6 Catholic 6137 $30,000-$49,999 0.19 1166 ## # ... 
with 42 more rows ``` ] --- ## Save data ```r rel_inc_long <- rel_inc %>% rename( religion = `Religious tradition`, n = `Sample Size` ) %>% pivot_longer( cols = -c(religion, n), names_to = "income", values_to = "proportion" ) %>% mutate(frequency = round(proportion * n)) ``` --- ## Barplot ```r ggplot(rel_inc_long, aes(y = religion, x = frequency)) + geom_col() ``` <img src="u2-d13-data-recode_files/figure-html/bar plot-1.png" width="65%" style="display: block; margin: auto;" /> --- ## Recode religion .panelset[ .panel[.panel-name[Recode] ```r rel_inc_long <- rel_inc_long %>% mutate(religion = case_when( religion == "Evangelical Protestant" ~ "Ev. Protestant", religion == "Historically Black Protestant" ~ "Hist. Black Protestant", religion == 'Unaffiliated (religious "nones")' ~ "Unaffiliated", TRUE ~ religion )) ``` ] .panel[.panel-name[Plot] <img src="u2-d13-data-recode_files/figure-html/unnamed-chunk-11-1.png" width="65%" style="display: block; margin: auto;" /> ] ] --- ## Reverse religion order .panelset[ .panel[.panel-name[Recode] ```r rel_inc_long <- rel_inc_long %>% mutate(religion = fct_rev(religion)) ``` ] .panel[.panel-name[Plot] <img src="u2-d13-data-recode_files/figure-html/unnamed-chunk-13-1.png" width="65%" style="display: block; margin: auto;" /> ] ] --- ## Add income .panelset[ .panel[.panel-name[Plot] <img src="u2-d13-data-recode_files/figure-html/rel-income-1.png" width="65%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r ggplot(rel_inc_long, aes(y = religion, x = frequency, fill = income)) + geom_col() ``` ] ] --- ## Fill bars .panelset[ .panel[.panel-name[Plot] <img src="u2-d13-data-recode_files/figure-html/rel-income-fill-1.png" width="65%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r ggplot(rel_inc_long, aes(y = religion, x = frequency, fill = income)) + geom_col(position = "fill") ``` ] ] --- ## Change colors .panelset[ .panel[.panel-name[Plot] <img 
src="u2-d13-data-recode_files/figure-html/rel-income-fill-viridis-1.png" width="65%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r ggplot(rel_inc_long, aes(y = religion, x = frequency, fill = income)) + geom_col(position = "fill") + scale_fill_viridis_d() ``` ] ] --- ## Change theme .panelset[ .panel[.panel-name[Plot] <img src="u2-d13-data-recode_files/figure-html/rel-income-fill-viridis-minimal-1.png" width="65%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r ggplot(rel_inc_long, aes(y = religion, x = frequency, fill = income)) + geom_col(position = "fill") + scale_fill_viridis_d() + theme_minimal() ``` ] ] --- ## Move legend to the bottom .panelset[ .panel[.panel-name[Plot] <img src="u2-d13-data-recode_files/figure-html/bottom-legend-1.png" width="65%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r ggplot(rel_inc_long, aes(y = religion, x = frequency, fill = income)) + geom_col(position = "fill") + scale_fill_viridis_d() + theme_minimal() + theme(legend.position = "bottom") ``` ] ] --- ## Legend adjustments .panelset[ .panel[.panel-name[Plot] <img src="u2-d13-data-recode_files/figure-html/unnamed-chunk-19-1.png" width="65%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r ggplot(rel_inc_long, aes(y = religion, x = frequency, fill = income)) + geom_col(position = "fill") + scale_fill_viridis_d() + theme_minimal() + theme(legend.position = "bottom") + guides(fill = guide_legend(nrow = 2, byrow = TRUE)) ``` ] ] --- ## Fix labels .panelset[ .panel[.panel-name[Plot] <img src="u2-d13-data-recode_files/figure-html/unnamed-chunk-20-1.png" width="65%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r ggplot(rel_inc_long, aes(y = religion, x = frequency, fill = income)) + geom_col(position = "fill") + scale_fill_viridis_d() + theme_minimal() + theme(legend.position = "bottom") + guides(fill = guide_legend(nrow = 2, byrow = TRUE)) + labs( x =
"Proportion", y = "", title = "Income distribution by religious group", subtitle = "Source: Pew Research Center, Religious Landscape Study", fill = "Income" ) ``` ] ] --- ## Acknowledgements * This course builds on materials from [Data Science in a Box](https://datasciencebox.org/), developed by Mine Çetinkaya-Rundel, which are adapted under the [Creative Commons Attribution Share Alike 4.0 International](https://github.com/rstudio-education/datascience-box/blob/master/LICENSE.md)