Set libraries and directories

library(ipumsr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(knitr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Project-relative locations. Edit these two paths so they point at the folder
# holding your data file and at the folder where plots should be saved; both
# are subdirectories of the R project where the .Rmd file is located.
dataDir <- "./data"    # input: directory the data file is read from
dataDir2 <- "./plots"  # output: directory plot files are saved to

# Favour fixed over scientific notation and print 5 significant digits.
options(digits = 5, scipen = 99)

Read the data

# Parse the DDI codebook (XML metadata) that describes the extract, ACS 2019
ddi <- file.path(dataDir, "usa_00001.xml") %>%
  read_ipums_ddi()
# Load the microdata described by that codebook
data <- ddi %>%
  read_ipums_micro()
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
# Display the first 6 rows of the data as a table
data %>%
  head() %>%
  kable()
YEAR SAMPLE SERIAL CBSERIAL HHWT CLUSTER STATEFIP MET2013 STRATA GQ PERNUM PERWT SEX AGE RACE RACED HISPAN HISPAND BPL BPLD CITIZEN RACAMIND RACASIAN RACBLK RACPACIS RACWHT RACOTHER RACNUM EDUC EDUCD MIGRATE1 MIGRATE1D MIGPLAC1 MIGMET131
2019 201901 1 2019010000088 11 2019000000011 1 0 220001 4 1 11 1 39 2 200 0 0 1 100 0 1 1 2 1 1 1 1 4 40 1 10 0 0
2019 201901 2 2019010000096 70 2019000000021 1 0 100001 3 1 70 2 21 1 100 0 0 13 1300 0 1 1 1 1 2 1 1 4 40 2 23 1 0
2019 201901 3 2019010000153 20 2019000000031 1 11500 110001 4 1 20 1 19 2 200 0 0 1 100 0 1 1 2 1 1 1 1 7 71 2 24 1 0
2019 201901 4 2019010000198 79 2019000000041 1 11500 110001 3 1 79 1 77 1 100 0 0 1 100 0 1 1 1 1 2 1 1 3 30 2 23 1 11500
2019 201901 5 2019010000205 53 2019000000051 1 33660 270101 3 1 53 1 41 2 200 0 0 1 100 0 1 1 2 1 1 1 1 3 30 2 23 1 33660
2019 201901 6 2019010000215 77 2019000000061 1 33860 200001 4 1 77 1 18 2 200 0 0 26 2600 0 1 1 2 1 1 1 1 6 65 3 32 26 19820
# Variable names as a one-column data frame
varname <- as.data.frame(colnames(data))

# Extract the variable-level "label" attribute of every column.
# seq_len() is used instead of 1:n so that a zero-column input yields an empty
# result rather than iterating over c(1, 0).
# exact = TRUE stops attr() from partially matching the value-label attribute
# "labels" when a column has no "label" attribute of its own.
n <- ncol(data)
labels_list <- map(
  seq_len(n),
  function(x) attr(data[[x]], "label", exact = TRUE)
)

# Same labels as a character vector; columns without a label give NA so that
# map_chr() does not fail on a NULL result.
labels_vector <- map_chr(
  seq_len(n),
  function(x) {
    lbl <- attr(data[[x]], "label", exact = TRUE)
    if (is.null(lbl)) NA_character_ else lbl
  }
)

# Names of the components stored in the DDI object
names(ddi)
##  [1] "file_name"        "file_path"        "file_type"        "ipums_project"   
##  [5] "extract_date"     "extract_notes"    "rectypes"         "rectype_idvar"   
##  [9] "rectypes_keyvars" "var_info"         "conditions"       "citation"        
## [13] "file_encoding"
# Name of the extract's data file
ddi[["file_name"]]
## [1] "usa_00001.dat"
# Description attached to the extract
ddi[["extract_notes"]]
## [1] "User-provided description:  Information about Population Migration among racial groups."
# Table of the variables contained in the extract
ddi[["var_info"]]
## # A tibble: 34 × 10
##    var_name var_l…¹ var_d…² val_la…³ code_…⁴ start   end imp_d…⁵ var_t…⁶ recty…⁷
##    <chr>    <chr>   <chr>   <list>   <chr>   <dbl> <dbl>   <dbl> <chr>   <lgl>  
##  1 YEAR     Census… "YEAR … <tibble>  <NA>       1     4       0 integer NA     
##  2 SAMPLE   IPUMS … "SAMPL… <tibble>  <NA>       5    10       0 integer NA     
##  3 SERIAL   Househ… "SERIA… <tibble> "Codes…    11    18       0 numeric NA     
##  4 CBSERIAL Origin… "CBSER… <tibble> "Codes…    19    31       0 numeric NA     
##  5 HHWT     Househ… "HHWT … <tibble> "Codes…    32    41       2 numeric NA     
##  6 CLUSTER  Househ… "CLUST… <tibble> "Codes…    42    54       0 numeric NA     
##  7 STATEFIP State … "STATE… <tibble>  <NA>      55    56       0 integer NA     
##  8 MET2013  Metrop… "A met… <tibble>  <NA>      57    61       0 integer NA     
##  9 STRATA   Househ… "STRAT… <tibble> "Codes…    62    73       0 numeric NA     
## 10 GQ       Group … "GQ cl… <tibble>  <NA>      74    74       0 integer NA     
## # … with 24 more rows, and abbreviated variable names ¹​var_label, ²​var_desc,
## #   ³​val_labels, ⁴​code_instr, ⁵​imp_decim, ⁶​var_type, ⁷​rectypes
# details on variable in extract
# Variable-level label of MIGRATE1, looked up in the DDI codebook
ipums_var_label(ddi, MIGRATE1)
## [1] "Migration status, 1 year [general version]"
# Value/label pairs of MIGRATE1; the code 3 ("Moved between states") listed in
# the output is the one used below to identify interstate movers
ipums_val_labels(ddi, MIGRATE1)
## # A tibble: 6 × 2
##     val lbl                 
##   <dbl> <chr>               
## 1     0 N/A                 
## 2     1 Same house          
## 3     2 Moved within state  
## 4     3 Moved between states
## 5     4 Abroad one year ago 
## 6     9 Unknown
# Lookup table of the STATEFIP codes and their labels (state names), taken
# from the DDI codebook; it is not used in this script but is useful and easy
# to read for later analysis
STATEFIP <- ipums_val_labels(ddi, STATEFIP)

# Add the STATEFIP labels to the data as the factor variable STATENAME
data$STATENAME <- as_factor(data$STATEFIP)

# Browse variables, labels/descriptions and values in a generated web page.
# Only launched in an interactive session, so that knitting the document or
# running it in batch mode does not try to open a viewer.
if (interactive()) {
  ipums_view(ddi)
}

# file name

Compute Unweighted Migration Rate

This code uses the dplyr package to keep only individuals aged 1 or older and then creates a new variable ‘n’ with a value of 1 for every individual. It summarises the whole sample (the data are not grouped) by summing ‘n’ to get the total population and by summing ‘n’ over the individuals who moved between states within the last year (MIGRATE1 equal to 3) to get the number of interstate movers. Finally, it creates a new variable called ‘interstate_mig_rate’, which is the number of interstate movers divided by the total population. The output is a single row of data with the total population, the total number of interstate movers and the rate of interstate migration.

# Unweighted national interstate migration rate for persons aged 1+:
# every person counts once (n = 1); movers are those with MIGRATE1 == 3.
migrate_national_unweighted <- data %>%
  filter(AGE >= 1) %>%
  mutate(n = 1) %>%
  summarise(
    pop = sum(n),
    interstate_movers = sum(n[MIGRATE1 == 3]),
    interstate_mig_rate = interstate_movers / pop
  )

# kable(head(migrate_national_unweighted))

Compute Weighted Migration Rate

The code below uses the %>% operator to chain together a series of data manipulation commands. The filter() function is used to keep only rows where the value in the AGE column is greater than or equal to 1. The data are not grouped, so the summarise() function calculates national totals: the total population (pop) and the total number of people who moved across state lines (interstate_movers), using the PERWT column as a weight. The mutate() function is then used to calculate the migration rate (interstate_mig_rate) as the number of interstate movers divided by the total population. The resulting data is stored in the variable migrate_national_weighted. The kable() call that would display the result is commented out.

# Weighted national interstate migration rate for persons aged 1+:
# person weights (PERWT) are summed instead of counting rows.
migrate_national_weighted <- filter(data, AGE >= 1) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# kable(head(migrate_national_weighted))

Compute Weighted Migration Rates by Gender/Sex

The final output is a table in which each row represents a group of people defined by their SEX; it shows the weighted population, the number of people who moved to a different state, and the migration rate.

# Weighted interstate migration rate by SEX for persons aged 1+
migrate_national_sex <- data %>%
  filter(AGE >= 1) %>%
  group_by(SEX) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3]),
    interstate_mig_rate = interstate_movers / pop
  )

# Show the first rows of the result as a table
migrate_national_sex %>%
  head() %>%
  kable()
SEX pop interstate_movers interstate_mig_rate
1 159795828 3796339 0.02376
2 164941778 3705537 0.02247

Compute Weighted Migration Rates by Age

This code is creating a new dataset called “migrate_age” that is derived from the original “data” dataset. It starts by only selecting rows where the “AGE” column is greater than or equal to 1. It then groups the remaining data by the “AGE” column, and calculates the total population (pop) and the total number of people who moved across state lines (interstate_movers) using the “PERWT” column. It then calculates the migration rate by dividing the number of interstate movers by the total population. The resulting dataset is displayed using the “kable()” function, which shows the first few rows of the dataset.

# Weighted interstate migration rate for each value of AGE (ages 1+)
migrate_age <- data %>%
  filter(AGE >= 1) %>%
  group_by(AGE) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3]),
    interstate_mig_rate = interstate_movers / pop
  )

# Show the first rows of the result as a table
migrate_age %>%
  head() %>%
  kable()
AGE pop interstate_movers interstate_mig_rate
1 3777331 114018 0.03018
2 3913567 118238 0.03021
3 4016207 94121 0.02344
4 4106801 94470 0.02300
5 3880268 91160 0.02349
6 3869326 80178 0.02072

Plot migration rates by age: a migration schedule by age

This is code for creating a line plot using the ggplot2 package in R. The plot will display the relationship between age and migration rate, using the migrate_age data set. The ggplot function is used to specify the data set to be plotted (data=migrate_age) and the variables to be plotted on the x and y axes (aes(x=AGE, y=interstate_mig_rate)). The geom_line() and geom_point() functions are used to add a line and points to the plot, respectively. The scale_x_continuous and scale_y_continuous functions are used to add axis labels to the plot.

# Migration schedule: interstate migration rate against age, drawn as a line
# with a point at each age
ggplot(migrate_age, aes(AGE, interstate_mig_rate)) +
  geom_line() +
  geom_point() +
  scale_x_continuous("Age") +
  scale_y_continuous("Migration Rate")

Smooth the age schedule, add title etc, plot using percentages

This gets rid of the jerky trend line and replaces it with a smoothed one. It does not capture the extremes but does show the general trends.

# Smoothed age schedule of interstate migration, with the rate shown in percent.
# Rows falling outside the axis limits are dropped before the smoother is
# fitted (the rendered output reports 2 removed rows).
ggplot(data = migrate_age, aes(x = AGE, y = interstate_mig_rate)) +
  geom_point() +
  geom_smooth(span = 0.24) +
  scale_x_continuous(name = "Age", limits = c(0, 100),
                     breaks = seq(0, 100, by = 5)) +
  # Breaks every percentage point up to the 6% axis limit; the upper bound was
  # previously 0.6 (60%), generating breaks far beyond the plotted range
  scale_y_continuous(name = "Migration Rate", limits = c(0, 0.06),
                     breaks = seq(0, 0.06, by = 0.01),
                     labels = percent) +
  labs(title = "Age Schedule for US Interstate Migration") +
  theme(text = element_text(size = 14))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).

Race schedule

This code is using the “dplyr” package in R to manipulate a data set and create a new one. The original data set is called “data”. The code first filters the data set to only include observations where the “AGE” variable is greater than or equal to 1. Then, it creates a new variable called “RACE_ETHNIC” and assigns a string value to it based on the values of the “HISPAN” and “RACE” variables.

For example, if the value of “HISPAN” is 0 and the value of “RACE” is 1, the value of “RACE_ETHNIC” would be “White”. If the value of “HISPAN” is greater than 0, the value of “RACE_ETHNIC” would be “Latinx”.

Next, it groups the data set by the “RACE_ETHNIC” variable and calculates the sum of the “PERWT” variable (which represents the population weight) for each group. It also calculates the sum of the “PERWT” variable for observations where the “MIGRATE1” variable is equal to 3 (which represents people who moved to a different state) for each group.

Finally, it creates a new variable called “interstate_mig_rate” that is the ratio of “interstate_movers” to “pop” for each group. The final step is to display the first few rows of the new data set using the “kable” function.

# Weighted interstate migration rate by race/ethnicity for persons aged 1+.
# Anyone with HISPAN > 0 is classified as "Latinx" regardless of RACE; everyone
# with HISPAN == 0 is classified by their RACE code. Rows matching none of the
# conditions get NA.
# NOTE(review): HISPAN > 0 would also capture a "not reported" code if the
# extract contains one -- confirm against the HISPAN value labels.
migrate_race <- data %>%
  filter(AGE >= 1) %>%
  mutate(RACE_ETHNIC = case_when(
    HISPAN == 0 & RACE == 1 ~ "White",
    HISPAN == 0 & RACE == 2 ~ "Black",
    HISPAN == 0 & RACE == 3 ~ "American Indian",
    HISPAN == 0 & (RACE >= 4 & RACE <= 6) ~ "Asian",
    HISPAN == 0 & RACE == 7 ~ "Other",
    HISPAN == 0 & RACE >= 8 ~ "Mixed",
    HISPAN > 0 ~ "Latinx"
  )) %>%
  group_by(RACE_ETHNIC) %>%
  summarise(pop = sum(PERWT),
            interstate_movers = sum(PERWT[MIGRATE1 == 3])) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# Show every group: head() keeps only the first 6 rows, which dropped the
# seventh category ("White") from the displayed table
kable(migrate_race)
RACE_ETHNIC pop interstate_movers interstate_mig_rate
American Indian 2187817 54078 0.02472
Asian 18681193 503396 0.02695
Black 40193456 867732 0.02159
Latinx 59604788 1037598 0.01741
Mixed 8224028 268469 0.03264
Other 817627 14735 0.01802

Migration rates by race x age

The chart shows the migration rate by age and race/ethnicity. The different colors represent different racial/ethnic groups, and we can see how the migration rate varies by age within each group. This can give insight into which racial/ethnic groups tend to migrate more and at what ages they are more likely to do so.

# Weighted interstate migration rate for every race/ethnicity x age
# combination (ages 1+); the result is returned ungrouped
migrate_race_age <- data %>%
  filter(AGE >= 1) %>%
  mutate(
    RACE_ETHNIC = case_when(
      HISPAN == 0 & RACE == 1 ~ "White",
      HISPAN == 0 & RACE == 2 ~ "Black",
      HISPAN == 0 & RACE == 3 ~ "American Indian",
      HISPAN == 0 & (RACE >= 4 & RACE <= 6) ~ "Asian",
      HISPAN == 0 & RACE == 7 ~ "Other",
      HISPAN == 0 & RACE >= 8 ~ "Mixed",
      HISPAN > 0 ~ "Latinx"
    )
  ) %>%
  group_by(RACE_ETHNIC, AGE) %>%
  summarise(
    pop = sum(PERWT),
    interstate_movers = sum(PERWT[MIGRATE1 == 3]),
    interstate_mig_rate = interstate_movers / pop,
    .groups = "drop"
  )

# Age schedule of interstate migration with one coloured line (plus points)
# per race/ethnicity group
ggplot(migrate_race_age,
       aes(AGE, interstate_mig_rate, colour = RACE_ETHNIC)) +
  geom_line() +
  geom_point() +
  scale_x_continuous("Age") +
  scale_y_continuous("Migration Rate") +
  ggtitle("Migration rate by Race and Age", subtitle = "") +
  labs(caption = "Source: data") +
  theme_minimal()