Migration Rates Exploration

Set libraries and directories

library(ipumsr)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(knitr)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

# Input and output locations, relative to the R project that holds the .Rmd
# file. Change these to where your data file is stored and to where you want
# to save plots.
dataDir <- "./data"    # directory the data file is read from
dataDir2 <- "./plots"  # directory plot files are saved to

# Strongly prefer fixed over scientific notation; print 5 significant digits
options(scipen = 99, digits = 5)

Read the data

# Read the DDI (XML) file that accompanies the extract, ACS 2019
ddi <- read_ipums_ddi(
  file.path(dataDir, "usa_00001.xml")
)
# Read the microdata described by the DDI
data <- read_ipums_micro(ddi)

## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.

# Display the first 6 rows of the data as a table
data %>%
  head() %>%
  kable()

YEAR	SAMPLE	SERIAL	CBSERIAL	HHWT	CLUSTER	STATEFIP	MET2013	STRATA	GQ	PERNUM	PERWT	SEX	AGE	RACE	RACED	BPL	BPLD	RACAMIND	RACASIAN	RACBLK	RACPACIS	RACWHT	RACOTHER	RACNUM	EDUC	EDUCD	MIGRATE1	MIGRATE1D	MIGPLAC1	MIGMET131
2019	201901	1	2019010000088	11	2019000000011	1	0	220001	4	1	11	1	39	2	200	1	100	1	1	2	1	1	1	1	4	40	1	10	0	0
2019	201901	2	2019010000096	70	2019000000021	1	0	100001	3	1	70	2	21	1	100	13	1300	1	1	1	1	2	1	1	4	40	2	23	1	0
2019	201901	3	2019010000153	20	2019000000031	1	11500	110001	4	1	20	1	19	2	200	1	100	1	1	2	1	1	1	1	7	71	2	24	1	0
2019	201901	4	2019010000198	79	2019000000041	1	11500	110001	3	1	79	1	77	1	100	1	100	1	1	1	1	2	1	1	3	30	2	23	1	11500
2019	201901	5	2019010000205	53	2019000000051	1	33660	270101	3	1	53	1	41	2	200	1	100	1	1	2	1	1	1	1	3	30	2	23	1	33660
2019	201901	6	2019010000215	77	2019000000061	1	33860	200001	4	1	77	1	18	2	200	26	2600	1	1	2	1	1	1	1	6	65	3	32	26	19820

# Extract variable names the traditional way (one-column data frame)
varname <- as.data.frame(colnames(data))

# Extract the "label" attribute attached to each column of the microdata.
# seq_len(n) is used instead of 1:n so that a data frame with zero columns
# yields an empty result rather than iterating over c(1, 0).
n <- ncol(data)
col_index <- seq_len(n)

# As a list: a column without a "label" attribute gives NULL
labels_list <- map(col_index, function(i) attr(data[[i]], "label"))

# As a character vector: map_chr() needs every column to carry a
# single-string label and errors otherwise
labels_vector <- map_chr(col_index, function(i) attr(data[[i]], "label"))

# DDI info: list the named components of the ddi object
names(ddi)

##  [1] "file_name"        "file_path"        "file_type"        "ipums_project"   
##  [5] "extract_date"     "extract_notes"    "rectypes"         "rectype_idvar"   
##  [9] "rectypes_keyvars" "var_info"         "conditions"       "citation"        
## [13] "file_encoding"

# Name of the extract's data file, as recorded in the DDI
ddi[["file_name"]]

## [1] "usa_00001.dat"

# Description of the extract file, as recorded in the DDI
ddi[["extract_notes"]]

## [1] "User-provided description:  Information about Population Migration among racial groups."

# Table of the variables in the extract (one row per variable)
ddi[["var_info"]]

## # A tibble: 34 × 10
##    var_name var_l…¹ var_d…² val_la…³ code_…⁴ start   end imp_d…⁵ var_t…⁶ recty…⁷
##    <chr>    <chr>   <chr>   <list>   <chr>   <dbl> <dbl>   <dbl> <chr>   <lgl>  
##  1 YEAR     Census… "YEAR … <tibble>  <NA>       1     4       0 integer NA     
##  2 SAMPLE   IPUMS … "SAMPL… <tibble>  <NA>       5    10       0 integer NA     
##  3 SERIAL   Househ… "SERIA… <tibble> "Codes…    11    18       0 numeric NA     
##  4 CBSERIAL Origin… "CBSER… <tibble> "Codes…    19    31       0 numeric NA     
##  5 HHWT     Househ… "HHWT … <tibble> "Codes…    32    41       2 numeric NA     
##  6 CLUSTER  Househ… "CLUST… <tibble> "Codes…    42    54       0 numeric NA     
##  7 STATEFIP State … "STATE… <tibble>  <NA>      55    56       0 integer NA     
##  8 MET2013  Metrop… "A met… <tibble>  <NA>      57    61       0 integer NA     
##  9 STRATA   Househ… "STRAT… <tibble> "Codes…    62    73       0 numeric NA     
## 10 GQ       Group … "GQ cl… <tibble>  <NA>      74    74       0 integer NA     
## # … with 24 more rows, and abbreviated variable names ¹var_label, ²var_desc,
## #   ³val_labels, ⁴code_instr, ⁵imp_decim, ⁶var_type, ⁷rectypes

# Details on one variable in the extract: the variable label of MIGRATE1
ipums_var_label(ddi, MIGRATE1)

## [1] "Migration status, 1 year [general version]"

# Value labels of MIGRATE1: which numeric code stands for which category
ipums_val_labels(ddi, MIGRATE1)

## # A tibble: 6 × 2
##     val lbl                 
##   <dbl> <chr>               
## 1     0 N/A                 
## 2     1 Same house          
## 3     2 Moved within state  
## 4     3 Moved between states
## 5     4 Abroad one year ago 
## 6     9 Unknown

# Table of STATEFIP value labels (code / state name pairs). It is not used
# later in this script, but it is useful and easy to read for later analysis.
STATEFIP <- ipums_val_labels(ddi, STATEFIP)

# Add the STATEFIP labels to the data as the factor variable STATENAME
data[["STATENAME"]] <- as_factor(data[["STATEFIP"]])

# Interactively view variables, labels/descriptions and values; this generates
# a web page. Only launch it in an interactive session so that knitting or
# batch-running this file does not try to open a viewer.
if (interactive()) {
  ipums_view(ddi)
}

# file name

Compute Unweighted Migration Rate

This code uses the dplyr package to keep only individuals aged 1 or older, then creates a new variable ‘n’ with a value of 1 for every individual. No grouping is applied: summarise() sums ‘n’ to get the unweighted population count, and sums ‘n’ over the rows where MIGRATE1 equals 3 to get the number of individuals who moved between states within the last year. Finally, it creates a new variable called ‘interstate_mig_rate’, which is the number of interstate movers divided by the total population. The result is a single row of data with the total population, the total number of interstate movers and the rate of interstate migration.

# National interstate migration rate from raw (unweighted) person counts
migrate_national_unweighted <- data %>%
  filter(AGE >= 1) %>%
  # every person record counts once
  mutate(n = 1) %>%
  summarise(
    pop = sum(n),
    # MIGRATE1 == 3 is "Moved between states"
    interstate_movers = sum(n[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

#kable(head(migrate_national_unweighted))

Compute Weighted Migration Rate

The code below uses the %>% operator to chain together a series of data manipulation commands. The filter() function keeps only rows where the value in the AGE column is greater than or equal to 1. No grouping is applied here (the breakdown by SEX comes in the next section). The summarise() function calculates the total population (pop) and the total number of people who moved across state lines (interstate_movers), using the PERWT column as a weight. The mutate() function then calculates the migration rate (interstate_mig_rate) as the number of interstate movers divided by the total population. The resulting single-row table is stored in the variable migrate_national_weighted. The kable() call that would display it is commented out.

# National interstate migration rate, using the PERWT column as the weight
migrate_national_weighted <- data %>%
  filter(AGE >= 1) %>%
  summarise(
    pop = sum(PERWT),
    # MIGRATE1 == 3 is "Moved between states"
    interstate_movers = sum(PERWT[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

#kable(head(migrate_national_weighted))

Compute Weighted Migration Rates by Gender/Sex

The final output is a table in which each row represents a group of people defined by their SEX; it shows the weighted population, the number of people who moved to a different state, and the migration rate.

# Weighted interstate migration rate, one row per SEX code
migrate_national_sex <- data %>%
  filter(AGE >= 1) %>%
  group_by(SEX) %>%
  summarise(
    pop = sum(PERWT),
    # MIGRATE1 == 3 is "Moved between states"
    interstate_movers = sum(PERWT[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

migrate_national_sex %>%
  head() %>%
  kable()

SEX	pop	interstate_movers	interstate_mig_rate
1	159795828	3796339	0.02376
2	164941778	3705537	0.02247

Compute Weighted Migration Rates by Age

This code is creating a new dataset called “migrate_age” that is derived from the original “data” dataset. It starts by only selecting rows where the “AGE” column is greater than or equal to 1. It then groups the remaining data by the “AGE” column, and calculates the total population (pop) and the total number of people who moved across state lines (interstate_movers) using the “PERWT” column. It then calculates the migration rate by dividing the number of interstate movers by the total population. The resulting dataset is displayed using the “kable()” function, which shows the first few rows of the dataset.

# Weighted interstate migration rate, one row per single year of AGE
migrate_age <- data %>%
  filter(AGE >= 1) %>%
  group_by(AGE) %>%
  summarise(
    pop = sum(PERWT),
    # MIGRATE1 == 3 is "Moved between states"
    interstate_movers = sum(PERWT[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# Show only the first six ages
migrate_age %>%
  head() %>%
  kable()

AGE	pop	interstate_movers	interstate_mig_rate
1	3777331	114018	0.03018
2	3913567	118238	0.03021
3	4016207	94121	0.02344
4	4106801	94470	0.02300
5	3880268	91160	0.02349
6	3869326	80178	0.02072

Plot migration rates by age: a migration schedule by age

This is code for creating a line plot using the ggplot2 package in R. The plot will display the relationship between age and migration rate, using the migrate_age data set. The ggplot function is used to specify the data set to be plotted (data=migrate_age) and the variables to be plotted on the x and y axes (aes(x=AGE, y=interstate_mig_rate)). The geom_line() and geom_point() functions are used to add a line and points to the plot, respectively. The scale_x_continuous and scale_y_continuous functions are used to add axis labels to the plot.

# Age schedule: migration rate against age, drawn as a line with points
ggplot(migrate_age, aes(x = AGE, y = interstate_mig_rate)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(name = "Age") +
  scale_y_continuous(name = "Migration Rate")

Smooth the age schedule, add title etc, plot using percentages

This gets rid of the jerky trend line and replaces it with a smoothed one. The smoothed line does not capture the extremes, but it does show the general trends.

# Smoothed age schedule with a title and the y axis shown as percentages.
# NOTE(review): rows outside the scale limits are removed before the smoother
# is fitted (ggplot2 warns about the removed rows), so rates above 6% neither
# appear as points nor influence the curve. coord_cartesian(ylim = ...) would
# zoom without dropping them -- confirm which behaviour is wanted.
ggplot(data = migrate_age, aes(x = AGE, y = interstate_mig_rate)) +
  geom_point() +
  geom_smooth(span = 0.24) +
  scale_x_continuous(
    name = "Age",
    limits = c(0, 100),
    breaks = seq(0, 100, by = 5)
  ) +
  scale_y_continuous(
    name = "Migration Rate",
    limits = c(0, 0.06),
    # breaks now stop at the upper limit (was 0.6, a typo for 0.06)
    breaks = seq(0, 0.06, by = 0.01),
    labels = percent
  ) +
  labs(title = "Age Schedule for US Interstate Migration") +
  theme(text = element_text(size = 14))

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values (`geom_point()`).

Race schedule

This code is using the “dplyr” package in R to manipulate a data set and create a new one. The original data set is called “data”. The code first filters the data set to only include observations where the “AGE” variable is greater than or equal to 1. Then, it creates a new variable called “RACE_ETHNIC” and assigns a string value to it based on the values of the “HISPAN” and “RACE” variables.

For example, if the value of “HISPAN” is 0 and the value of “RACE” is 1, the value of “RACE_ETHNIC” would be “White”. If the value of “HISPAN” is greater than 0, the value of “RACE_ETHNIC” would be “Latinx”.

Next, it groups the data set by the “RACE_ETHNIC” variable and calculates the sum of the “PERWT” variable (which represents the population weight) for each group. It also calculates the sum of the “PERWT” variable for observations where the “MIGRATE1” variable is equal to 3 (which represents people who moved to a different state) for each group.

Finally, it creates a new variable called “interstate_mig_rate” that is the ratio of “interstate_movers” to “pop” for each group. The final step is to display the first few rows of the new data set using the “kable” function.

# Weighted interstate migration rate by combined race/ethnicity category.
# Anyone with HISPAN > 0 is labelled "Latinx" whatever their RACE code;
# everyone else (HISPAN == 0) is labelled from RACE.
migrate_race <- data %>%
  filter(AGE >= 1) %>%
  mutate(
    RACE_ETHNIC = case_when(
      HISPAN == 0 & RACE == 1 ~ "White",
      HISPAN == 0 & RACE == 2 ~ "Black",
      HISPAN == 0 & RACE == 3 ~ "American Indian",
      HISPAN == 0 & (RACE >= 4 & RACE <= 6) ~ "Asian",
      HISPAN == 0 & RACE == 7 ~ "Other",
      HISPAN == 0 & RACE >= 8 ~ "Mixed",
      HISPAN > 0 ~ "Latinx"
    )
  ) %>%
  group_by(RACE_ETHNIC) %>%
  summarise(
    pop = sum(PERWT),
    # MIGRATE1 == 3 is "Moved between states"
    interstate_movers = sum(PERWT[MIGRATE1 == 3])
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# Print every category. head() keeps only six rows, but case_when() above can
# produce seven categories, so the last one would be left out of the table.
kable(migrate_race)

RACE_ETHNIC	pop	interstate_movers	interstate_mig_rate
American Indian	2187817	54078	0.02472
Asian	18681193	503396	0.02695
Black	40193456	867732	0.02159
Latinx	59604788	1037598	0.01741
Mixed	8224028	268469	0.03264
Other	817627	14735	0.01802

Migration rates by race x age

The chart shows the migration rate by age and race/ethnicity. The different colors represent different racial/ethnic groups, and we can see how the migration rate varies by age within each group. This can give insight into which racial/ethnic groups tend to migrate more and at what ages they are more likely to do so.

# Weighted interstate migration rate for every RACE_ETHNIC x AGE combination.
# Anyone with HISPAN > 0 is labelled "Latinx" whatever their RACE code;
# everyone else (HISPAN == 0) is labelled from RACE.
migrate_race_age <- data %>%
  filter(AGE >= 1) %>%
  mutate(
    RACE_ETHNIC = case_when(
      HISPAN == 0 & RACE == 1 ~ "White",
      HISPAN == 0 & RACE == 2 ~ "Black",
      HISPAN == 0 & RACE == 3 ~ "American Indian",
      HISPAN == 0 & (RACE >= 4 & RACE <= 6) ~ "Asian",
      HISPAN == 0 & RACE == 7 ~ "Other",
      HISPAN == 0 & RACE >= 8 ~ "Mixed",
      HISPAN > 0 ~ "Latinx"
    )
  ) %>%
  group_by(RACE_ETHNIC, AGE) %>%
  summarise(
    pop = sum(PERWT),
    # MIGRATE1 == 3 is "Moved between states"
    interstate_movers = sum(PERWT[MIGRATE1 == 3]),
    # return an ungrouped table
    .groups = "drop"
  ) %>%
  mutate(interstate_mig_rate = interstate_movers / pop)

# Age schedule of migration rates, one coloured line per RACE_ETHNIC group
ggplot(
  migrate_race_age,
  aes(x = AGE, y = interstate_mig_rate, color = RACE_ETHNIC)
) +
  geom_line() +
  geom_point() +
  scale_x_continuous(name = "Age") +
  scale_y_continuous(name = "Migration Rate") +
  labs(
    title = "Migration rate by Race and Age",
    subtitle = "",
    caption = "Source: data"
  ) +
  theme_minimal()