Cleaning

Set up

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import Data

df <- read_csv("data/Global Missing Migrants.csv")
Rows: 13020 Columns: 19
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): Incident Type, Reported Month, Region of Origin, Region of Inciden...
dbl  (8): Incident year, Number of Dead, Minimum Estimated Number of Missing...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect structure

str(df)
spc_tbl_ [13,020 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Incident Type                      : chr [1:13020] "Incident" "Incident" "Incident" "Incident" ...
 $ Incident year                      : num [1:13020] 2014 2014 2014 2014 2014 ...
 $ Reported Month                     : chr [1:13020] "January" "January" "January" "January" ...
 $ Region of Origin                   : chr [1:13020] "Central America" "Latin America / Caribbean (P)" "Latin America / Caribbean (P)" "Central America" ...
 $ Region of Incident                 : chr [1:13020] "North America" "North America" "North America" "North America" ...
 $ Country of Origin                  : chr [1:13020] "Guatemala" "Unknown" "Unknown" "Mexico" ...
 $ Number of Dead                     : num [1:13020] 1 1 1 1 1 1 12 1 1 1 ...
 $ Minimum Estimated Number of Missing: num [1:13020] 0 0 0 0 0 0 0 0 0 0 ...
 $ Total Number of Dead and Missing   : num [1:13020] 1 1 1 1 1 1 12 1 1 1 ...
 $ Number of Survivors                : num [1:13020] 0 0 0 0 2 0 0 0 0 0 ...
 $ Number of Females                  : num [1:13020] 0 0 0 0 0 0 9 0 0 0 ...
 $ Number of Males                    : num [1:13020] 1 0 0 1 1 0 0 0 0 0 ...
 $ Number of Children                 : num [1:13020] 0 0 0 0 0 0 3 0 0 0 ...
 $ Cause of Death                     : chr [1:13020] "Mixed or unknown" "Mixed or unknown" "Mixed or unknown" "Violence" ...
 $ Migration route                    : chr [1:13020] "US-Mexico border crossing" "US-Mexico border crossing" "US-Mexico border crossing" "US-Mexico border crossing" ...
 $ Location of death                  : chr [1:13020] "Pima Country Office of the Medical Examiner jurisdiction, Arizona, USA (see coordinates for exact location)" "Pima Country Office of the Medical Examiner jurisdiction, Arizona, USA (see coordinates for exact location)" "Pima Country Office of the Medical Examiner jurisdiction, Arizona, USA (see coordinates for exact location)" "near Douglas, Arizona, USA" ...
 $ Information Source                 : chr [1:13020] "Pima County Office of the Medical Examiner (PCOME)" "Pima County Office of the Medical Examiner (PCOME)" "Pima County Office of the Medical Examiner (PCOME)" "Ministry of Foreign Affairs Mexico, Pima County Office of the Medical Examiner (PCOME)" ...
 $ Coordinates                        : chr [1:13020] "31.650259, -110.366453" "31.59713, -111.73756" "31.94026, -113.01125" "31.506777, -109.315632" ...
 $ UNSD Geographical Grouping         : chr [1:13020] "Northern America" "Northern America" "Northern America" "Northern America" ...
 - attr(*, "spec")=
  .. cols(
  ..   `Incident Type` = col_character(),
  ..   `Incident year` = col_double(),
  ..   `Reported Month` = col_character(),
  ..   `Region of Origin` = col_character(),
  ..   `Region of Incident` = col_character(),
  ..   `Country of Origin` = col_character(),
  ..   `Number of Dead` = col_double(),
  ..   `Minimum Estimated Number of Missing` = col_double(),
  ..   `Total Number of Dead and Missing` = col_double(),
  ..   `Number of Survivors` = col_double(),
  ..   `Number of Females` = col_double(),
  ..   `Number of Males` = col_double(),
  ..   `Number of Children` = col_double(),
  ..   `Cause of Death` = col_character(),
  ..   `Migration route` = col_character(),
  ..   `Location of death` = col_character(),
  ..   `Information Source` = col_character(),
  ..   Coordinates = col_character(),
  ..   `UNSD Geographical Grouping` = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
glimpse(df)
Rows: 13,020
Columns: 19
$ `Incident Type`                       <chr> "Incident", "Incident", "Inciden…
$ `Incident year`                       <dbl> 2014, 2014, 2014, 2014, 2014, 20…
$ `Reported Month`                      <chr> "January", "January", "January",…
$ `Region of Origin`                    <chr> "Central America", "Latin Americ…
$ `Region of Incident`                  <chr> "North America", "North America"…
$ `Country of Origin`                   <chr> "Guatemala", "Unknown", "Unknown…
$ `Number of Dead`                      <dbl> 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 1…
$ `Minimum Estimated Number of Missing` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Total Number of Dead and Missing`    <dbl> 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 1…
$ `Number of Survivors`                 <dbl> 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,…
$ `Number of Females`                   <dbl> 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0,…
$ `Number of Males`                     <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,…
$ `Number of Children`                  <dbl> 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,…
$ `Cause of Death`                      <chr> "Mixed or unknown", "Mixed or un…
$ `Migration route`                     <chr> "US-Mexico border crossing", "US…
$ `Location of death`                   <chr> "Pima Country Office of the Medi…
$ `Information Source`                  <chr> "Pima County Office of the Medic…
$ Coordinates                           <chr> "31.650259, -110.366453", "31.59…
$ `UNSD Geographical Grouping`          <chr> "Northern America", "Northern Am…

Data Cleaning

# Standardizing and Categorizing Causes of Death

# Separate multiple causes and trim whitespace
df_cleaned1 <- df |>
  separate_rows(`Cause of Death`, sep = ",") |>
  mutate(`Cause of Death` = str_trim(`Cause of Death`))

# Standardize cause names
df_cleaned2 <- df_cleaned1 |>
  mutate(`Cause of Death` = case_when(
    str_to_lower(`Cause of Death`) %in% c(
      "food", "water", 
      "harsh environmental conditions / lack of adequate shelter"
    ) ~ "Lack of food and water",
    str_to_lower(`Cause of Death`) == "vehicle accident / death linked to hazardous transport" ~ "Vehicle accident",
    str_to_lower(`Cause of Death`) == "sickness / lack of access to adequate healthcare" ~ "Sickness",
    str_to_lower(`Cause of Death`) == "mixed or unknown" ~ "Unknown or Mixed",
    TRUE ~ `Cause of Death`
  )) |>
  filter(!str_to_lower(`Cause of Death`) %in% c("food", "water"))

# Categorize causes into broader categories
df_cleaned3 <- df_cleaned2 |>
  mutate(cause_category = case_when(
    str_detect(`Cause of Death`, regex("drowning", ignore_case = TRUE)) ~ "Drowning",
    str_detect(`Cause of Death`, regex("violence", ignore_case = TRUE)) ~ "Violence",
    str_detect(`Cause of Death`, regex("vehicle", ignore_case = TRUE)) ~ "Hazardous Transport",
    str_detect(`Cause of Death`, regex("environmental|shelter|lack of food and water", ignore_case = TRUE)) ~ "Environmental Exposure",
    str_detect(`Cause of Death`, regex("sickness|healthcare", ignore_case = TRUE)) ~ "Lack of Healthcare",
    str_detect(`Cause of Death`, regex("accidental", ignore_case = TRUE)) ~ "Accidental",
    str_detect(`Cause of Death`, regex("unknown|mixed", ignore_case = TRUE)) ~ "Unknown or Mixed",
    TRUE ~ "Other"
  ))

# View cleaned causes
df_cleaned3 |> 
  distinct(`Cause of Death`) |> 
  arrange(`Cause of Death`)
# A tibble: 7 × 1
  `Cause of Death`      
  <chr>                 
1 Accidental death      
2 Drowning              
3 Lack of food and water
4 Sickness              
5 Unknown or Mixed      
6 Vehicle accident      
7 Violence              
# Fomatting Temporal Variables
df_cleaned4 <- df_cleaned3|>
  mutate(`Reported Month` = str_to_title(`Reported Month`),`Reported Month` = factor(`Reported Month`, levels = month.name, ordered = TRUE), `Incident year` = as.integer(`Incident year`))
# Standardizing Geographic and Route Information

# Unify Regions and Routes
df_cleaned5 <- df_cleaned4|>
  mutate(
    `Region of Origin` = str_replace_all(`Region of Origin`, "\\s*/\\s*Caribbean \\(P\\)", " & Caribbean"),
    `Region of Incident` = str_replace_all(`Region of Incident`, "\\s*/\\s*Caribbean \\(P\\)", " & Caribbean"),
    `Migration route` = str_to_sentence(`Migration route`)
  )

# Remove or Truncate Verbose Text Columns
df_cleaned5|>
  mutate(`Location of death` = str_extract(`Location of death`, "^(.*?)(,|\\()"),
    `Location of death` = str_remove_all(`Location of death`, ",|\\("))
# A tibble: 15,786 × 20
   `Incident Type` `Incident year` `Reported Month` `Region of Origin`       
   <chr>                     <int> <ord>            <chr>                    
 1 Incident                   2014 January          Central America          
 2 Incident                   2014 January          Latin America & Caribbean
 3 Incident                   2014 January          Latin America & Caribbean
 4 Incident                   2014 January          Central America          
 5 Incident                   2014 January          Northern Africa          
 6 Incident                   2014 January          Northern Africa          
 7 Incident                   2014 January          Northern Africa          
 8 Incident                   2014 January          Latin America & Caribbean
 9 Incident                   2014 January          Unknown                  
10 Incident                   2014 January          Latin America & Caribbean
# ℹ 15,776 more rows
# ℹ 16 more variables: `Region of Incident` <chr>, `Country of Origin` <chr>,
#   `Number of Dead` <dbl>, `Minimum Estimated Number of Missing` <dbl>,
#   `Total Number of Dead and Missing` <dbl>, `Number of Survivors` <dbl>,
#   `Number of Females` <dbl>, `Number of Males` <dbl>,
#   `Number of Children` <dbl>, `Cause of Death` <chr>,
#   `Migration route` <chr>, `Location of death` <chr>, …
# Remove Redundant Columns
df_cleaned6 <- df_cleaned5|>
  select(-`Number of Dead`,-`Minimum Estimated Number of Missing`,-`Number of Survivors`,-`Information Source`,-`UNSD Geographical Grouping`
         ,Coordinates)

glimpse(df_cleaned6)
Rows: 15,786
Columns: 15
$ `Incident Type`                    <chr> "Incident", "Incident", "Incident",…
$ `Incident year`                    <int> 2014, 2014, 2014, 2014, 2014, 2014,…
$ `Reported Month`                   <ord> January, January, January, January,…
$ `Region of Origin`                 <chr> "Central America", "Latin America &…
$ `Region of Incident`               <chr> "North America", "North America", "…
$ `Country of Origin`                <chr> "Guatemala", "Unknown", "Unknown", …
$ `Total Number of Dead and Missing` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1…
$ `Number of Females`                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0,…
$ `Number of Males`                  <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,…
$ `Number of Children`               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
$ `Cause of Death`                   <chr> "Unknown or Mixed", "Unknown or Mix…
$ `Migration route`                  <chr> "Us-mexico border crossing", "Us-me…
$ `Location of death`                <chr> "Pima Country Office of the Medical…
$ Coordinates                        <chr> "31.650259, -110.366453", "31.59713…
$ cause_category                     <chr> "Unknown or Mixed", "Unknown or Mix…
# Cleaning Column Names
df_cleaned7 <- df_cleaned6

names(df_cleaned7) <- tolower(gsub(" ", "_", names(df_cleaned7)))

df_cleaned8 <- df_cleaned7|>
  rename(total_death_missing = total_number_of_dead_and_missing)

df_cleaned <- df_cleaned8

glimpse(df_cleaned)
Rows: 15,786
Columns: 15
$ incident_type       <chr> "Incident", "Incident", "Incident", "Incident", "I…
$ incident_year       <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 20…
$ reported_month      <ord> January, January, January, January, January, Janua…
$ region_of_origin    <chr> "Central America", "Latin America & Caribbean", "L…
$ region_of_incident  <chr> "North America", "North America", "North America",…
$ country_of_origin   <chr> "Guatemala", "Unknown", "Unknown", "Mexico", "Suda…
$ total_death_missing <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 1, 1, 1, 1, 1…
$ number_of_females   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0,…
$ number_of_males     <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
$ number_of_children  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1,…
$ cause_of_death      <chr> "Unknown or Mixed", "Unknown or Mixed", "Unknown o…
$ migration_route     <chr> "Us-mexico border crossing", "Us-mexico border cro…
$ location_of_death   <chr> "Pima Country Office of the Medical Examiner juris…
$ coordinates         <chr> "31.650259, -110.366453", "31.59713, -111.73756", …
$ cause_category      <chr> "Unknown or Mixed", "Unknown or Mixed", "Unknown o…

Save Data

write.csv(df_cleaned, file = "cleaned_data.csv")
save(df_cleaned, file = "cleaned_data.RData")