library(ipumsr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.0
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(knitr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Directories, relative to the R project that holds the .Rmd file. Change them
# to where your data file is stored and to where you want to save plots.
dataDir <- "./data"    # directory the data file is read from
dataDir2 <- "./plots"  # directory plot files are saved to

# Print settings: strong penalty against scientific notation, 5 digits.
options(digits = 5, scipen = 99)

# Read the DDI codebook of the extract (ACS 2019), then the microdata it
# describes.
ddi <- read_ipums_ddi(file.path(dataDir, "usa_00001.xml"))
data <- read_ipums_micro(ddi)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
# Render the first 6 rows of the microdata as a table.
data %>%
  head() %>%
  kable()
| YEAR | SAMPLE | SERIAL | CBSERIAL | HHWT | CLUSTER | STATEFIP | MET2013 | STRATA | GQ | PERNUM | PERWT | SEX | AGE | RACE | RACED | HISPAN | HISPAND | BPL | BPLD | CITIZEN | RACAMIND | RACASIAN | RACBLK | RACPACIS | RACWHT | RACOTHER | RACNUM | EDUC | EDUCD | MIGRATE1 | MIGRATE1D | MIGPLAC1 | MIGMET131 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2019 | 201901 | 1 | 2019010000088 | 11 | 2019000000011 | 1 | 0 | 220001 | 4 | 1 | 11 | 1 | 39 | 2 | 200 | 0 | 0 | 1 | 100 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 4 | 40 | 1 | 10 | 0 | 0 |
| 2019 | 201901 | 2 | 2019010000096 | 70 | 2019000000021 | 1 | 0 | 100001 | 3 | 1 | 70 | 2 | 21 | 1 | 100 | 0 | 0 | 13 | 1300 | 0 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 4 | 40 | 2 | 23 | 1 | 0 |
| 2019 | 201901 | 3 | 2019010000153 | 20 | 2019000000031 | 1 | 11500 | 110001 | 4 | 1 | 20 | 1 | 19 | 2 | 200 | 0 | 0 | 1 | 100 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 7 | 71 | 2 | 24 | 1 | 0 |
| 2019 | 201901 | 4 | 2019010000198 | 79 | 2019000000041 | 1 | 11500 | 110001 | 3 | 1 | 79 | 1 | 77 | 1 | 100 | 0 | 0 | 1 | 100 | 0 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 3 | 30 | 2 | 23 | 1 | 11500 |
| 2019 | 201901 | 5 | 2019010000205 | 53 | 2019000000051 | 1 | 33660 | 270101 | 3 | 1 | 53 | 1 | 41 | 2 | 200 | 0 | 0 | 1 | 100 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 3 | 30 | 2 | 23 | 1 | 33660 |
| 2019 | 201901 | 6 | 2019010000215 | 77 | 2019000000061 | 1 | 33860 | 200001 | 4 | 1 | 77 | 1 | 18 | 2 | 200 | 0 | 0 | 26 | 2600 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 6 | 65 | 3 | 32 | 26 | 19820 |
# Extract the variable (column) names the traditional way, as a one-column
# data frame.
varname <- as.data.frame(colnames(data))

# Extract the variable labels stored in each column's "label" attribute.
# - seq_len(n) is used instead of 1:n, which would count down (1, 0) if the
#   data had no columns.
# - exact = TRUE stops attr() from partially matching a differently named
#   attribute (e.g. "labels") when a column has no "label" attribute.
n <- ncol(data)
labels_list <- map(
  seq_len(n),
  function(x) attr(data[[x]], "label", exact = TRUE)
)
# Character vector of the same labels; a column without a label yields NA
# instead of making map_chr() fail on a NULL result.
labels_vector <- map_chr(
  seq_len(n),
  function(x) {
    lbl <- attr(data[[x]], "label", exact = TRUE)
    if (is.null(lbl)) NA_character_ else lbl
  }
)
# ddi info: list the names of the components stored in the DDI object
names(ddi)
## [1] "file_name" "file_path" "file_type" "ipums_project"
## [5] "extract_date" "extract_notes" "rectypes" "rectype_idvar"
## [9] "rectypes_keyvars" "var_info" "conditions" "citation"
## [13] "file_encoding"
# Name of the extract's data file, as recorded in the DDI.
ddi[["file_name"]]
## [1] "usa_00001.dat"
# Description of the extract, as recorded in the DDI.
ddi[["extract_notes"]]
## [1] "User-provided description: Information about Population Migration among racial groups."
# Table of the variables contained in the extract, one row per variable.
ddi[["var_info"]]
## # A tibble: 34 × 10
## var_name var_l…¹ var_d…² val_la…³ code_…⁴ start end imp_d…⁵ var_t…⁶ recty…⁷
## <chr> <chr> <chr> <list> <chr> <dbl> <dbl> <dbl> <chr> <lgl>
## 1 YEAR Census… "YEAR … <tibble> <NA> 1 4 0 integer NA
## 2 SAMPLE IPUMS … "SAMPL… <tibble> <NA> 5 10 0 integer NA
## 3 SERIAL Househ… "SERIA… <tibble> "Codes… 11 18 0 numeric NA
## 4 CBSERIAL Origin… "CBSER… <tibble> "Codes… 19 31 0 numeric NA
## 5 HHWT Househ… "HHWT … <tibble> "Codes… 32 41 2 numeric NA
## 6 CLUSTER Househ… "CLUST… <tibble> "Codes… 42 54 0 numeric NA
## 7 STATEFIP State … "STATE… <tibble> <NA> 55 56 0 integer NA
## 8 MET2013 Metrop… "A met… <tibble> <NA> 57 61 0 integer NA
## 9 STRATA Househ… "STRAT… <tibble> "Codes… 62 73 0 numeric NA
## 10 GQ Group … "GQ cl… <tibble> <NA> 74 74 0 integer NA
## # … with 24 more rows, and abbreviated variable names ¹var_label, ²var_desc,
## # ³val_labels, ⁴code_instr, ⁵imp_decim, ⁶var_type, ⁷rectypes
# details on a variable in the extract: the variable label of MIGRATE1
ipums_var_label(ddi, MIGRATE1)
## [1] "Migration status, 1 year [general version]"
# value labels of MIGRATE1: one row per coded value (val) with its label (lbl)
ipums_val_labels(ddi, MIGRATE1)
## # A tibble: 6 × 2
## val lbl
## <dbl> <chr>
## 1 0 N/A
## 2 1 Same house
## 3 2 Moved within state
## 4 3 Moved between states
## 5 4 Abroad one year ago
## 6 9 Unknown
# Keep the STATEFIP value labels (state names) as their own object. It is not
# used in this script, but it is useful and easy to read for later analysis.
STATEFIP <- ipums_val_labels(ddi, STATEFIP)

# Add the STATEFIP labels to the data as a factor variable, STATENAME.
data[["STATENAME"]] <- as_factor(data[["STATEFIP"]])

# Interactively browse variables, labels/descriptions and values; this
# generates a web page.
ipums_view(ddi)
# file name
This code uses the dplyr package to filter the data for individuals with an age of 1 or greater, then it creates a new variable ‘n’ with a value of 1 for every individual. It then summarizes the whole data set (without any grouping) by summing the ‘n’ variable for the population and for the number of individuals who moved between states within the last year (MIGRATE1 equal to 3). Finally, it creates a new variable called ‘interstate_mig_rate’ which is the number of interstate movers divided by the total population. It will output a single row of data with the total population, total number of interstate movers and the rate of interstate migration. Because every individual counts as 1, these figures are unweighted.
# Unweighted national interstate migration among persons aged 1 and over.
# Each record counts once (n = 1); records with MIGRATE1 == 3 are counted as
# interstate movers. The result is a single row.
migrate_national_unweighted <-
  data %>%
  filter(AGE >= 1) %>%
  mutate(n = 1) %>%
  summarise(
    pop = sum(n),
    interstate_movers = sum(n[MIGRATE1 == 3]),
    interstate_mig_rate = interstate_movers / pop
  )
# kable(head(migrate_national_unweighted))
The next two chunks use the %>% operator to chain together a series of data manipulation commands. In both, the filter() function is used to keep only rows where the value in the AGE column is greater than or equal to 1, the summarise() function is used to calculate the total population (pop) and the total number of people who moved across state lines (interstate_movers) using the PERWT column as a weight, and the mutate() function is then used to calculate the migration rate (interstate_mig_rate) as the number of interstate movers divided by the total population. The first chunk computes these figures for the nation as a whole and stores the result in the variable migrate_national_weighted (its kable() call is commented out). The second chunk adds the group_by() function, which groups the data by the SEX column; the resulting data is stored in the variable migrate_national_sex, and the kable() function displays the first few rows of the resulting data.
# Weighted national interstate migration among persons aged 1 and over.
# PERWT is summed instead of counting records; records with MIGRATE1 == 3
# are the interstate movers. The result is a single row.
migrate_national_weighted <-
  data %>%
  filter(AGE >= 1) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3]),
    interstate_mig_rate = interstate_movers / pop
  )
# kable(head(migrate_national_weighted))
The final output is a table in which each row represents a group of people defined by their SEX; it shows the population, the number of people who moved to a different state, and the migration rate.
# Weighted interstate migration by SEX among persons aged 1 and over:
# one row per SEX value with population, interstate movers and their ratio.
migrate_national_sex <-
  data %>%
  filter(AGE >= 1) %>%
  group_by(SEX) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# Show the first rows of the result as a table.
migrate_national_sex %>%
  head() %>%
  kable()
| SEX | pop | interstate_movers | interstate_mig_rate |
|---|---|---|---|
| 1 | 159795828 | 3796339 | 0.02376 |
| 2 | 164941778 | 3705537 | 0.02247 |
This code is creating a new dataset called “migrate_age” that is derived from the original “data” dataset. It starts by only selecting rows where the “AGE” column is greater than or equal to 1. It then groups the remaining data by the “AGE” column, and calculates the total population (pop) and the total number of people who moved across state lines (interstate_movers) using the “PERWT” column. It then calculates the migration rate by dividing the number of interstate movers by the total population. The resulting dataset is displayed using the “kable()” function, which shows the first few rows of the dataset.
# Weighted interstate migration by single year of AGE (ages 1 and over):
# one row per AGE with population, interstate movers and their ratio.
migrate_age <-
  data %>%
  filter(AGE >= 1) %>%
  group_by(AGE) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# Show the first rows of the result as a table.
migrate_age %>%
  head() %>%
  kable()
| AGE | pop | interstate_movers | interstate_mig_rate |
|---|---|---|---|
| 1 | 3777331 | 114018 | 0.03018 |
| 2 | 3913567 | 118238 | 0.03021 |
| 3 | 4016207 | 94121 | 0.02344 |
| 4 | 4106801 | 94470 | 0.02300 |
| 5 | 3880268 | 91160 | 0.02349 |
| 6 | 3869326 | 80178 | 0.02072 |
This is code for creating a line plot using the ggplot2 package in R. The plot will display the relationship between age and migration rate, using the migrate_age data set. The ggplot function is used to specify the data set to be plotted (data=migrate_age) and the variables to be plotted on the x and y axes (aes(x=AGE, y=interstate_mig_rate)). The geom_line() and geom_point() functions are used to add a line and points to the plot, respectively. The scale_x_continuous and scale_y_continuous functions are used to add axis labels to the plot.
# Line-and-point plot of the interstate migration rate against age.
migrate_age %>%
  ggplot(mapping = aes(x = AGE, y = interstate_mig_rate)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(name = "Age") +
  scale_y_continuous(name = "Migration Rate")
This plot gets rid of the jerky trend line and replaces it with a smoothed one. The smoothed line does not capture the extremes, but it does show the general trends.
# Age schedule of interstate migration: points plus a loess-smoothed trend.
# The y axis is limited to 0-6%; rows outside the axis limits are removed
# before plotting and smoothing (ggplot2 reports them in warnings).
ggplot(data = migrate_age, aes(x = AGE, y = interstate_mig_rate)) +
  geom_point() +
  geom_smooth(span = 0.24) +
  scale_x_continuous(
    name = "Age",
    limits = c(0, 100),
    breaks = seq(0, 100, by = 5)
  ) +
  scale_y_continuous(
    name = "Migration Rate",
    limits = c(0, 0.06),
    # Breaks stop at the axis limit (0.06); the previous upper bound of 0.6
    # was ten times the limit and only generated out-of-range breaks.
    breaks = seq(0, 0.06, by = 0.01),
    labels = percent
  ) +
  labs(title = "Age Schedule for US Interstate Migration") +
  theme(text = element_text(size = 14))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
This code is using the “dplyr” package in R to manipulate a data set and create a new one. The original data set is called “data”. The code first filters the data set to only include observations where the “AGE” variable is greater than or equal to 1. Then, it creates a new variable called “RACE_ETHNIC” and assigns a string value to it based on the values of the “HISPAN” and “RACE” variables.
For example, if the value of “HISPAN” is 0 and the value of “RACE” is 1, the value of “RACE_ETHNIC” would be “White”. If the value of “HISPAN” is greater than 0, the value of “RACE_ETHNIC” would be “Latinx”.
Next, it groups the data set by the “RACE_ETHNIC” variable and calculates the sum of the “PERWT” variable (which represents the population weight) for each group. It also calculates the sum of the “PERWT” variable for observations where the “MIGRATE1” variable is equal to 3 (which represents people who moved to a different state) for each group.
Finally, it creates a new variable called “interstate_mig_rate” that is the ratio of “interstate_movers” to “pop” for each group. The final step is to display the first few rows of the new data set using the “kable” function.
# Weighted interstate migration by race/ethnicity among persons aged 1+.
# RACE_ETHNIC is derived from HISPAN and RACE: any record with HISPAN > 0 is
# "Latinx" regardless of RACE; records with HISPAN == 0 are classified by
# their RACE code.
migrate_race <- data %>%
  filter(AGE >= 1) %>%
  mutate(RACE_ETHNIC = case_when(
    HISPAN == 0 & RACE == 1 ~ "White",
    HISPAN == 0 & RACE == 2 ~ "Black",
    HISPAN == 0 & RACE == 3 ~ "American Indian",
    HISPAN == 0 & (RACE >= 4 & RACE <= 6) ~ "Asian",
    HISPAN == 0 & RACE == 7 ~ "Other",
    HISPAN == 0 & RACE >= 8 ~ "Mixed",
    HISPAN > 0 ~ "Latinx"
  )) %>%
  group_by(RACE_ETHNIC) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# Print the whole summary: it has one row per group (seven groups), and
# head() keeps only six rows, which dropped the last group ("White").
kable(migrate_race)
| RACE_ETHNIC | pop | interstate_movers | interstate_mig_rate |
|---|---|---|---|
| American Indian | 2187817 | 54078 | 0.02472 |
| Asian | 18681193 | 503396 | 0.02695 |
| Black | 40193456 | 867732 | 0.02159 |
| Latinx | 59604788 | 1037598 | 0.01741 |
| Mixed | 8224028 | 268469 | 0.03264 |
| Other | 817627 | 14735 | 0.01802 |
The chart shows the migration rate by age and race/ethnicity. The different colors represent different racial/ethnic groups, and we can see how the migration rate varies by age within each group. This can give insight into which racial/ethnic groups tend to migrate more and at what ages they are more likely to do so.
# Weighted interstate migration by race/ethnicity and single year of age
# (ages 1 and over). RACE_ETHNIC is derived from HISPAN and RACE: HISPAN > 0
# is "Latinx"; otherwise the group follows the RACE code.
migrate_race_age <-
  data %>%
  filter(AGE >= 1) %>%
  mutate(
    RACE_ETHNIC = case_when(
      HISPAN == 0 & RACE == 1 ~ "White",
      HISPAN == 0 & RACE == 2 ~ "Black",
      HISPAN == 0 & RACE == 3 ~ "American Indian",
      HISPAN == 0 & (RACE >= 4 & RACE <= 6) ~ "Asian",
      HISPAN == 0 & RACE == 7 ~ "Other",
      HISPAN == 0 & RACE >= 8 ~ "Mixed",
      HISPAN > 0 ~ "Latinx"
    )
  ) %>%
  group_by(RACE_ETHNIC, AGE) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3]),
    .groups = "drop"
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# One coloured line (with points) per race/ethnicity group, rate against age.
migrate_race_age %>%
  ggplot(
    mapping = aes(x = AGE, y = interstate_mig_rate, color = RACE_ETHNIC)
  ) +
  geom_line() +
  geom_point() +
  scale_x_continuous(name = "Age") +
  scale_y_continuous(name = "Migration Rate") +
  labs(
    title = "Migration rate by Race and Age",
    subtitle = "",
    caption = "Source: data"
  ) +
  theme_minimal()