Initial LA County Small Business Cleaning

Install Necessary Packages

install.packages('tidyverse')

## Installing package into 'C:/Users/Eduardo Galvez/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Eduardo Galvez\AppData\Local\Temp\RtmpwDqDle\downloaded_packages

Import Packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)

Load in datasets

demo_data <- suppressMessages(read_csv('ca_demo_by_zip.csv', skip = 1 , show_col_types = FALSE))
active_biz <- suppressMessages(read_csv('Listing_of_Active_Businesses.csv', show_col_types = FALSE))
#Name and Trade Name of Firm; Contact; Address and City, State Zip; Capabilities Narrative; E-mail Address; Year Established, putting this here for now to showcase 


#Updated dataset, this contains manufacturing, services, r&d, and construction
#after looking into the data from the small business database, it seems that there
#is lots of overlap
small_biz <- suppressMessages(read_csv('small_biz_data.txt',show_col_types = FALSE))

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

Get summary of datasets

summary(demo_data)
summary(active_biz)
summary(small_biz)

Cleaning the Active Business Dataset—————————————————————————————-

Clean the Active Business Dataset by extracting zip Code and remove any null/invalid zip code values

clean_active_biz <- active_biz %>%
  mutate(`ZIP CODE` = substr(`ZIP CODE`, 1, 5))


num_nulls <- sum(is.na(clean_active_biz$`ZIP CODE`) | nchar(clean_active_biz$`ZIP CODE`) < 5)
print(num_nulls)

## [1] 327

filter_active_biz <- clean_active_biz %>%
  filter(nchar(`ZIP CODE`) == 5)


zip_num_count <- grepl("^\\d+$", filter_active_biz$`ZIP CODE`)
count_table <- table(zip_num_count)
print(count_table)

## zip_num_count
##  FALSE   TRUE 
##     63 574492

zip_only_int <- filter_active_biz[zip_num_count, ]



head(zip_only_int)

## # A tibble: 6 × 16
##   `LOCATION ACCOUNT #` `BUSINESS NAME`         `DBA NAME` `STREET ADDRESS` CITY 
##   <chr>                <chr>                   <chr>      <chr>            <chr>
## 1 0000000150-0001-5    A A OFICINA CENTRAL HI… <NA>       2015 W TEMPLE S… LOS …
## 2 0000000156-0001-2    SPRINGBOARD NON-PROFIT… MONEY MAN… 1605 W OLYMPIC … LOS …
## 3 0002842614-0001-2    AUSTIN B CREEK          <NA>       3610 W MAGNOLIA… BURB…
## 4 0003176540-0001-1    SOEUNG CHAING           CAFECAFE   1330 W PACIFIC … WILM…
## 5 0002728984-0001-8    MARIA ALVAREZ           <NA>       9625 SYLMAR AVE… PANO…
## 6 0002973746-0002-1    BELINDA MIXON-JOY       BOOM'S RO… 327 CULVER BLVD… PLAY…
## # ℹ 11 more variables: `ZIP CODE` <chr>, `LOCATION DESCRIPTION` <chr>,
## #   `MAILING ADDRESS` <chr>, `MAILING CITY` <chr>, `MAILING ZIP CODE` <chr>,
## #   NAICS <dbl>, `PRIMARY NAICS DESCRIPTION` <chr>, `COUNCIL DISTRICT` <dbl>,
## #   `LOCATION START DATE` <chr>, `LOCATION END DATE` <lgl>, LOCATION <chr>

Clean Null values within NAICS description column

non_null_rows <- zip_only_int %>%
  filter(!is.na(`PRIMARY NAICS DESCRIPTION`))


head(non_null_rows)

## # A tibble: 6 × 16
##   `LOCATION ACCOUNT #` `BUSINESS NAME`         `DBA NAME` `STREET ADDRESS` CITY 
##   <chr>                <chr>                   <chr>      <chr>            <chr>
## 1 0000000150-0001-5    A A OFICINA CENTRAL HI… <NA>       2015 W TEMPLE S… LOS …
## 2 0000000156-0001-2    SPRINGBOARD NON-PROFIT… MONEY MAN… 1605 W OLYMPIC … LOS …
## 3 0002842614-0001-2    AUSTIN B CREEK          <NA>       3610 W MAGNOLIA… BURB…
## 4 0003176540-0001-1    SOEUNG CHAING           CAFECAFE   1330 W PACIFIC … WILM…
## 5 0002728984-0001-8    MARIA ALVAREZ           <NA>       9625 SYLMAR AVE… PANO…
## 6 0002973746-0002-1    BELINDA MIXON-JOY       BOOM'S RO… 327 CULVER BLVD… PLAY…
## # ℹ 11 more variables: `ZIP CODE` <chr>, `LOCATION DESCRIPTION` <chr>,
## #   `MAILING ADDRESS` <chr>, `MAILING CITY` <chr>, `MAILING ZIP CODE` <chr>,
## #   NAICS <dbl>, `PRIMARY NAICS DESCRIPTION` <chr>, `COUNCIL DISTRICT` <dbl>,
## #   `LOCATION START DATE` <chr>, `LOCATION END DATE` <lgl>, LOCATION <chr>

Clean Null Business Name

clean_active_biz_name <- non_null_rows %>% 
  filter(!is.na(`BUSINESS NAME`))


head(clean_active_biz_name)

## # A tibble: 6 × 16
##   `LOCATION ACCOUNT #` `BUSINESS NAME`         `DBA NAME` `STREET ADDRESS` CITY 
##   <chr>                <chr>                   <chr>      <chr>            <chr>
## 1 0000000150-0001-5    A A OFICINA CENTRAL HI… <NA>       2015 W TEMPLE S… LOS …
## 2 0000000156-0001-2    SPRINGBOARD NON-PROFIT… MONEY MAN… 1605 W OLYMPIC … LOS …
## 3 0002842614-0001-2    AUSTIN B CREEK          <NA>       3610 W MAGNOLIA… BURB…
## 4 0003176540-0001-1    SOEUNG CHAING           CAFECAFE   1330 W PACIFIC … WILM…
## 5 0002728984-0001-8    MARIA ALVAREZ           <NA>       9625 SYLMAR AVE… PANO…
## 6 0002973746-0002-1    BELINDA MIXON-JOY       BOOM'S RO… 327 CULVER BLVD… PLAY…
## # ℹ 11 more variables: `ZIP CODE` <chr>, `LOCATION DESCRIPTION` <chr>,
## #   `MAILING ADDRESS` <chr>, `MAILING CITY` <chr>, `MAILING ZIP CODE` <chr>,
## #   NAICS <dbl>, `PRIMARY NAICS DESCRIPTION` <chr>, `COUNCIL DISTRICT` <dbl>,
## #   `LOCATION START DATE` <chr>, `LOCATION END DATE` <lgl>, LOCATION <chr>

#luckily there are no null business name values in this column

Drop the location end date, as all the values are null

  clean_active_biz_name <- clean_active_biz_name %>% select(-`LOCATION END DATE`)

Clean the Small Business Dataset ——————————————————————————————-

Clean the zip code column

  #similar cleaning to the active business dataset
  clean_small_biz_zip <- small_biz %>%
    mutate(`Zip` = substr(`Zip`, 1, 5))
  
  
  sb_num_nulls <- sum(is.na(clean_small_biz_zip$`Zip`) | nchar(clean_small_biz_zip$`Zip`) < 5)
  print(sb_num_nulls)

## [1] 1

  valid_zip <- clean_small_biz_zip %>%
    filter(nchar(`Zip`) == 5)
  
  
  sb_zip_num_count <- grepl("^\\d+$", valid_zip$`Zip`)
  sb_count_table <- table(sb_zip_num_count)
  print(sb_count_table)

## sb_zip_num_count
## TRUE 
## 6769

  sb_zip_only_int <- valid_zip[sb_zip_num_count, ]
  
  
  
  head(sb_zip_only_int)

## # A tibble: 6 × 13
##    View `Name of Firm`    `Trade Name` unknown...4 unknown...5 `Address, line 1`
##   <dbl> <chr>             <chr>        <chr>       <chr>       <chr>            
## 1     1 Maya Advertising… <NA>         Luis Vasqu… PRESIDENT   300 The Village  
## 2     2 SIGNATURE OFFICE… <NA>         GARY KRIEG… General Ma… 6033 S CENTRAL A…
## 3     3 CHEETAH EXPRESS   <NA>         LEE ELLIS   <NA>        3010 WILSHIRE BL…
## 4     4 SAESHE ADVERTISI… <NA>         YOUNG YU    VICE PRES   1055 WEST 7TH ST…
## 5     5 ELECTROLIZING INC <NA>         JACK  MORG… SALES       1947 HOOPER AVE  
## 6     6 STEMAR EQUIPMENT… <NA>         TIM MARKON… PRESIDENT   353 S CENTRAL AVE
## # ℹ 7 more variables: `Address, line 2` <chr>, City <chr>, State <chr>,
## #   Zip <chr>, `Capabilities Narrative` <chr>, `E-mail Address` <chr>,
## #   `Year Established` <dbl>

Clean + Combine the unknown…4 and unknown…5 into the contact column

sb_clean_contact <- sb_zip_only_int %>% 
  filter(!is.na(`unknown...4`))

print(sb_clean_contact)

## # A tibble: 6,706 × 13
##     View `Name of Firm`   `Trade Name` unknown...4 unknown...5 `Address, line 1`
##    <dbl> <chr>            <chr>        <chr>       <chr>       <chr>            
##  1     1 Maya Advertisin… <NA>         Luis Vasqu… PRESIDENT   300 The Village  
##  2     2 SIGNATURE OFFIC… <NA>         GARY KRIEG… General Ma… 6033 S CENTRAL A…
##  3     3 CHEETAH EXPRESS  <NA>         LEE ELLIS   <NA>        3010 WILSHIRE BL…
##  4     4 SAESHE ADVERTIS… <NA>         YOUNG YU    VICE PRES   1055 WEST 7TH ST…
##  5     5 ELECTROLIZING I… <NA>         JACK  MORG… SALES       1947 HOOPER AVE  
##  6     6 STEMAR EQUIPMEN… <NA>         TIM MARKON… PRESIDENT   353 S CENTRAL AVE
##  7     7 NATURAL BUILDIN… <NA>         CHARLES YI  Project Co… 16610 S WESTERN …
##  8     8 U M S INC        UNITED MAIN… BO LEE      General Ma… 2140 W OLYMPIC B…
##  9     9 STOVER SEED COM… <NA>         STEPHEN KN… President   1415 EAST SIXTH …
## 10    10 HOLLYWOOD FILM … <NA>         VINCENT J … Director o… 9265 BORDEN AVE  
## # ℹ 6,696 more rows
## # ℹ 7 more variables: `Address, line 2` <chr>, City <chr>, State <chr>,
## #   Zip <chr>, `Capabilities Narrative` <chr>, `E-mail Address` <chr>,
## #   `Year Established` <dbl>

  sb_contact_combine <- sb_clean_contact %>%
  mutate(Contact = paste(`unknown...4`, `unknown...5`, sep = ", ")) %>%
  select(-`unknown...4`, -`unknown...5`)



  print(sb_contact_combine)

## # A tibble: 6,706 × 12
##     View `Name of Firm`   `Trade Name` `Address, line 1` `Address, line 2` City 
##    <dbl> <chr>            <chr>        <chr>             <chr>             <chr>
##  1     1 Maya Advertisin… <NA>         300 The Village   <NA>              Redo…
##  2     2 SIGNATURE OFFIC… <NA>         6033 S CENTRAL A… <NA>              LOS …
##  3     3 CHEETAH EXPRESS  <NA>         3010 WILSHIRE BL… <NA>              LOS …
##  4     4 SAESHE ADVERTIS… <NA>         1055 WEST 7TH ST… <NA>              LOS …
##  5     5 ELECTROLIZING I… <NA>         1947 HOOPER AVE   <NA>              LOS …
##  6     6 STEMAR EQUIPMEN… <NA>         353 S CENTRAL AVE <NA>              LOS …
##  7     7 NATURAL BUILDIN… <NA>         16610 S WESTERN … <NA>              GARD…
##  8     8 U M S INC        UNITED MAIN… 2140 W OLYMPIC B… <NA>              LOS …
##  9     9 STOVER SEED COM… <NA>         1415 EAST SIXTH … <NA>              LOS …
## 10    10 HOLLYWOOD FILM … <NA>         9265 BORDEN AVE   <NA>              SUN …
## # ℹ 6,696 more rows
## # ℹ 6 more variables: State <chr>, Zip <chr>, `Capabilities Narrative` <chr>,
## #   `E-mail Address` <chr>, `Year Established` <dbl>, Contact <chr>

Remove null from: “Name of Firm”, “Capbilities Narrative”

sb_clean <- sb_contact_combine %>% 
  filter(!is.na(`Name of Firm`))

print(sb_clean)

## # A tibble: 6,706 × 12
##     View `Name of Firm`   `Trade Name` `Address, line 1` `Address, line 2` City 
##    <dbl> <chr>            <chr>        <chr>             <chr>             <chr>
##  1     1 Maya Advertisin… <NA>         300 The Village   <NA>              Redo…
##  2     2 SIGNATURE OFFIC… <NA>         6033 S CENTRAL A… <NA>              LOS …
##  3     3 CHEETAH EXPRESS  <NA>         3010 WILSHIRE BL… <NA>              LOS …
##  4     4 SAESHE ADVERTIS… <NA>         1055 WEST 7TH ST… <NA>              LOS …
##  5     5 ELECTROLIZING I… <NA>         1947 HOOPER AVE   <NA>              LOS …
##  6     6 STEMAR EQUIPMEN… <NA>         353 S CENTRAL AVE <NA>              LOS …
##  7     7 NATURAL BUILDIN… <NA>         16610 S WESTERN … <NA>              GARD…
##  8     8 U M S INC        UNITED MAIN… 2140 W OLYMPIC B… <NA>              LOS …
##  9     9 STOVER SEED COM… <NA>         1415 EAST SIXTH … <NA>              LOS …
## 10    10 HOLLYWOOD FILM … <NA>         9265 BORDEN AVE   <NA>              SUN …
## # ℹ 6,696 more rows
## # ℹ 6 more variables: State <chr>, Zip <chr>, `Capabilities Narrative` <chr>,
## #   `E-mail Address` <chr>, `Year Established` <dbl>, Contact <chr>

sb_clean <- sb_clean %>% 
  filter(!is.na(`Capabilities Narrative`))


print(sb_clean)

## # A tibble: 6,020 × 12
##     View `Name of Firm`   `Trade Name` `Address, line 1` `Address, line 2` City 
##    <dbl> <chr>            <chr>        <chr>             <chr>             <chr>
##  1     1 Maya Advertisin… <NA>         300 The Village   <NA>              Redo…
##  2     2 SIGNATURE OFFIC… <NA>         6033 S CENTRAL A… <NA>              LOS …
##  3     3 CHEETAH EXPRESS  <NA>         3010 WILSHIRE BL… <NA>              LOS …
##  4     4 SAESHE ADVERTIS… <NA>         1055 WEST 7TH ST… <NA>              LOS …
##  5     5 ELECTROLIZING I… <NA>         1947 HOOPER AVE   <NA>              LOS …
##  6     6 STEMAR EQUIPMEN… <NA>         353 S CENTRAL AVE <NA>              LOS …
##  7     7 NATURAL BUILDIN… <NA>         16610 S WESTERN … <NA>              GARD…
##  8     8 U M S INC        UNITED MAIN… 2140 W OLYMPIC B… <NA>              LOS …
##  9     9 STOVER SEED COM… <NA>         1415 EAST SIXTH … <NA>              LOS …
## 10    10 HOLLYWOOD FILM … <NA>         9265 BORDEN AVE   <NA>              SUN …
## # ℹ 6,010 more rows
## # ℹ 6 more variables: State <chr>, Zip <chr>, `Capabilities Narrative` <chr>,
## #   `E-mail Address` <chr>, `Year Established` <dbl>, Contact <chr>

Drop view column from Small Business dataset, essentially a row numerator

  sb_clean <- sb_clean %>% select(-`View`)

Clean California Demographics Dataset—————————————————————————————

Keep relevant columns and rename

  demo_data <- demo_data %>% 
    select(
      "Geographic Area Name",
      "Estimate!!SEX AND AGE!!Total population!!Under 5 years",
      "Estimate!!SEX AND AGE!!Total population!!5 to 9 years",
      "Estimate!!SEX AND AGE!!Total population!!10 to 14 years",
      "Estimate!!SEX AND AGE!!Total population!!15 to 19 years",
      "Estimate!!SEX AND AGE!!Total population!!20 to 24 years",
      "Estimate!!SEX AND AGE!!Total population!!25 to 34 years",
      "Estimate!!SEX AND AGE!!Total population!!35 to 44 years",
      "Estimate!!SEX AND AGE!!Total population!!45 to 54 years",
      "Estimate!!SEX AND AGE!!Total population!!55 to 59 years",
      "Estimate!!SEX AND AGE!!Total population!!60 to 64 years",
      "Estimate!!SEX AND AGE!!Total population!!65 to 74 years",
      "Estimate!!SEX AND AGE!!Total population!!75 to 84 years",
      "Estimate!!SEX AND AGE!!Total population!!85 years and over",
      "Estimate!!SEX AND AGE!!Total population!!Median age (years)"
    ) %>%
    rename(
      "Zip_code" = "Geographic Area Name",
      "est_under_5" = "Estimate!!SEX AND AGE!!Total population!!Under 5 years",
      "est_5_to_9" = "Estimate!!SEX AND AGE!!Total population!!5 to 9 years",
      "est_10_to_14" = "Estimate!!SEX AND AGE!!Total population!!10 to 14 years",
      "est_15_to_19" = "Estimate!!SEX AND AGE!!Total population!!15 to 19 years",
      "est_20_to_24" = "Estimate!!SEX AND AGE!!Total population!!20 to 24 years",
      "est_25_to_34" = "Estimate!!SEX AND AGE!!Total population!!25 to 34 years",
      "est_35_to_44" = "Estimate!!SEX AND AGE!!Total population!!35 to 44 years",
      "est_45_to_54" = "Estimate!!SEX AND AGE!!Total population!!45 to 54 years",
      "est_55_to_59" = "Estimate!!SEX AND AGE!!Total population!!55 to 59 years",
      "est_60_to_64" = "Estimate!!SEX AND AGE!!Total population!!60 to 64 years",
      "est_65_to_74" = "Estimate!!SEX AND AGE!!Total population!!65 to 74 years",
      "est_75_to_84" = "Estimate!!SEX AND AGE!!Total population!!75 to 84 years",
      "est_85_over" = "Estimate!!SEX AND AGE!!Total population!!85 years and over",
      "est_median_age" = "Estimate!!SEX AND AGE!!Total population!!Median age (years)"
    )

  colnames(demo_data)

##  [1] "Zip_code"       "est_under_5"    "est_5_to_9"     "est_10_to_14"  
##  [5] "est_15_to_19"   "est_20_to_24"   "est_25_to_34"   "est_35_to_44"  
##  [9] "est_45_to_54"   "est_55_to_59"   "est_60_to_64"   "est_65_to_74"  
## [13] "est_75_to_84"   "est_85_over"    "est_median_age"

Clean zip code column

  #get the last 5 chars to get the zip code
  demo_data <- demo_data %>%
    mutate(`Zip_code` = substr(`Zip_code`, nchar(`Zip_code`) - 4, nchar(`Zip_code`)))

  summary(demo_data)

##    Zip_code          est_under_5     est_5_to_9    est_10_to_14 
##  Length:1769        Min.   :   0   Min.   :   0   Min.   :   0  
##  Class :character   1st Qu.:  59   1st Qu.:  71   1st Qu.:  73  
##  Mode  :character   Median : 833   Median : 868   Median : 877  
##                     Mean   :1362   Mean   :1375   Mean   :1469  
##                     3rd Qu.:2215   3rd Qu.:2255   3rd Qu.:2348  
##                     Max.   :8224   Max.   :9724   Max.   :9449  
##   est_15_to_19    est_20_to_24    est_25_to_34    est_35_to_44  
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :    0  
##  1st Qu.:   72   1st Qu.:   65   1st Qu.:  147   1st Qu.:  165  
##  Median :  854   Median :  880   Median : 2014   Median : 1968  
##  Mean   : 1440   Mean   : 1522   Mean   : 3396   Mean   : 2959  
##  3rd Qu.: 2296   3rd Qu.: 2341   3rd Qu.: 5630   3rd Qu.: 4960  
##  Max.   :12266   Max.   :16890   Max.   :18544   Max.   :15930  
##   est_45_to_54    est_55_to_59   est_60_to_64   est_65_to_74   est_75_to_84   
##  Min.   :    0   Min.   :   0   Min.   :   0   Min.   :   0   Min.   :   0.0  
##  1st Qu.:  178   1st Qu.: 103   1st Qu.: 114   1st Qu.: 187   1st Qu.:  81.0  
##  Median : 2022   Median :1030   Median : 992   Median :1412   Median : 653.0  
##  Mean   : 2850   Mean   :1406   Mean   :1276   Mean   :1851   Mean   : 910.9  
##  3rd Qu.: 4717   3rd Qu.:2356   3rd Qu.:2162   3rd Qu.:3066   3rd Qu.:1509.0  
##  Max.   :13794   Max.   :6993   Max.   :6315   Max.   :8537   Max.   :5172.0  
##   est_85_over     est_median_age    
##  Min.   :   0.0   Length:1769       
##  1st Qu.:  24.0   Class :character  
##  Median : 264.0   Mode  :character  
##  Mean   : 432.5                     
##  3rd Qu.: 741.0                     
##  Max.   :3169.0

  #make sure there are no null or zip codes with length less than 5
  demo_zip_nulls <- sum(is.na(demo_data$`Zip_code`) | nchar(demo_data$`Zip_code`) < 5)
  print(num_nulls)

## [1] 327

  demo_data <- demo_data %>%
    filter(nchar(`Zip_code`) == 5)
  
  
  demo_zip_num_count <- grepl("^\\d+$", demo_data$`Zip_code`)
  count_table <- table(demo_zip_num_count)
  print(count_table)

## demo_zip_num_count
## TRUE 
## 1769

  demo_data <- demo_data[demo_zip_num_count, ]
  summary(demo_data)

##    Zip_code          est_under_5     est_5_to_9    est_10_to_14 
##  Length:1769        Min.   :   0   Min.   :   0   Min.   :   0  
##  Class :character   1st Qu.:  59   1st Qu.:  71   1st Qu.:  73  
##  Mode  :character   Median : 833   Median : 868   Median : 877  
##                     Mean   :1362   Mean   :1375   Mean   :1469  
##                     3rd Qu.:2215   3rd Qu.:2255   3rd Qu.:2348  
##                     Max.   :8224   Max.   :9724   Max.   :9449  
##   est_15_to_19    est_20_to_24    est_25_to_34    est_35_to_44  
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :    0  
##  1st Qu.:   72   1st Qu.:   65   1st Qu.:  147   1st Qu.:  165  
##  Median :  854   Median :  880   Median : 2014   Median : 1968  
##  Mean   : 1440   Mean   : 1522   Mean   : 3396   Mean   : 2959  
##  3rd Qu.: 2296   3rd Qu.: 2341   3rd Qu.: 5630   3rd Qu.: 4960  
##  Max.   :12266   Max.   :16890   Max.   :18544   Max.   :15930  
##   est_45_to_54    est_55_to_59   est_60_to_64   est_65_to_74   est_75_to_84   
##  Min.   :    0   Min.   :   0   Min.   :   0   Min.   :   0   Min.   :   0.0  
##  1st Qu.:  178   1st Qu.: 103   1st Qu.: 114   1st Qu.: 187   1st Qu.:  81.0  
##  Median : 2022   Median :1030   Median : 992   Median :1412   Median : 653.0  
##  Mean   : 2850   Mean   :1406   Mean   :1276   Mean   :1851   Mean   : 910.9  
##  3rd Qu.: 4717   3rd Qu.:2356   3rd Qu.:2162   3rd Qu.:3066   3rd Qu.:1509.0  
##  Max.   :13794   Max.   :6993   Max.   :6315   Max.   :8537   Max.   :5172.0  
##   est_85_over     est_median_age    
##  Min.   :   0.0   Length:1769       
##  1st Qu.:  24.0   Class :character  
##  Median : 264.0   Mode  :character  
##  Mean   : 432.5                     
##  3rd Qu.: 741.0                     
##  Max.   :3169.0

Remove any nulls present in Demographic data rows

  summary(demo_data)

##    Zip_code          est_under_5     est_5_to_9    est_10_to_14 
##  Length:1769        Min.   :   0   Min.   :   0   Min.   :   0  
##  Class :character   1st Qu.:  59   1st Qu.:  71   1st Qu.:  73  
##  Mode  :character   Median : 833   Median : 868   Median : 877  
##                     Mean   :1362   Mean   :1375   Mean   :1469  
##                     3rd Qu.:2215   3rd Qu.:2255   3rd Qu.:2348  
##                     Max.   :8224   Max.   :9724   Max.   :9449  
##   est_15_to_19    est_20_to_24    est_25_to_34    est_35_to_44  
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :    0  
##  1st Qu.:   72   1st Qu.:   65   1st Qu.:  147   1st Qu.:  165  
##  Median :  854   Median :  880   Median : 2014   Median : 1968  
##  Mean   : 1440   Mean   : 1522   Mean   : 3396   Mean   : 2959  
##  3rd Qu.: 2296   3rd Qu.: 2341   3rd Qu.: 5630   3rd Qu.: 4960  
##  Max.   :12266   Max.   :16890   Max.   :18544   Max.   :15930  
##   est_45_to_54    est_55_to_59   est_60_to_64   est_65_to_74   est_75_to_84   
##  Min.   :    0   Min.   :   0   Min.   :   0   Min.   :   0   Min.   :   0.0  
##  1st Qu.:  178   1st Qu.: 103   1st Qu.: 114   1st Qu.: 187   1st Qu.:  81.0  
##  Median : 2022   Median :1030   Median : 992   Median :1412   Median : 653.0  
##  Mean   : 2850   Mean   :1406   Mean   :1276   Mean   :1851   Mean   : 910.9  
##  3rd Qu.: 4717   3rd Qu.:2356   3rd Qu.:2162   3rd Qu.:3066   3rd Qu.:1509.0  
##  Max.   :13794   Max.   :6993   Max.   :6315   Max.   :8537   Max.   :5172.0  
##   est_85_over     est_median_age    
##  Min.   :   0.0   Length:1769       
##  1st Qu.:  24.0   Class :character  
##  Median : 264.0   Mode  :character  
##  Mean   : 432.5                     
##  3rd Qu.: 741.0                     
##  Max.   :3169.0

  demo_data <- na.omit(demo_data)

  summary(demo_data)

##    Zip_code          est_under_5     est_5_to_9    est_10_to_14 
##  Length:1769        Min.   :   0   Min.   :   0   Min.   :   0  
##  Class :character   1st Qu.:  59   1st Qu.:  71   1st Qu.:  73  
##  Mode  :character   Median : 833   Median : 868   Median : 877  
##                     Mean   :1362   Mean   :1375   Mean   :1469  
##                     3rd Qu.:2215   3rd Qu.:2255   3rd Qu.:2348  
##                     Max.   :8224   Max.   :9724   Max.   :9449  
##   est_15_to_19    est_20_to_24    est_25_to_34    est_35_to_44  
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :    0  
##  1st Qu.:   72   1st Qu.:   65   1st Qu.:  147   1st Qu.:  165  
##  Median :  854   Median :  880   Median : 2014   Median : 1968  
##  Mean   : 1440   Mean   : 1522   Mean   : 3396   Mean   : 2959  
##  3rd Qu.: 2296   3rd Qu.: 2341   3rd Qu.: 5630   3rd Qu.: 4960  
##  Max.   :12266   Max.   :16890   Max.   :18544   Max.   :15930  
##   est_45_to_54    est_55_to_59   est_60_to_64   est_65_to_74   est_75_to_84   
##  Min.   :    0   Min.   :   0   Min.   :   0   Min.   :   0   Min.   :   0.0  
##  1st Qu.:  178   1st Qu.: 103   1st Qu.: 114   1st Qu.: 187   1st Qu.:  81.0  
##  Median : 2022   Median :1030   Median : 992   Median :1412   Median : 653.0  
##  Mean   : 2850   Mean   :1406   Mean   :1276   Mean   :1851   Mean   : 910.9  
##  3rd Qu.: 4717   3rd Qu.:2356   3rd Qu.:2162   3rd Qu.:3066   3rd Qu.:1509.0  
##  Max.   :13794   Max.   :6993   Max.   :6315   Max.   :8537   Max.   :5172.0  
##   est_85_over     est_median_age    
##  Min.   :   0.0   Length:1769       
##  1st Qu.:  24.0   Class :character  
##  Median : 264.0   Mode  :character  
##  Mean   : 432.5                     
##  3rd Qu.: 741.0                     
##  Max.   :3169.0

Check for duplicates

  dupe_demo <- duplicated(demo_data)
  a <- sum(dupe_demo)

  dupe_sb_clean <- duplicated(sb_clean)
  b <- sum(dupe_sb_clean)

  dupe_active_biz <- duplicated(clean_active_biz_name)
  c <- sum(dupe_active_biz)
  
  paste("Number of duplicate rows in demo data:", a)

## [1] "Number of duplicate rows in demo data: 0"

  paste("Number of duplicate rows in small business data:", b)

## [1] "Number of duplicate rows in small business data: 0"

  paste("Number of duplicate rows in active business data:", c)

## [1] "Number of duplicate rows in active business data: 0"

Creating a report for each of the datasets to check integrity

  summary(demo_data)

##    Zip_code          est_under_5     est_5_to_9    est_10_to_14 
##  Length:1769        Min.   :   0   Min.   :   0   Min.   :   0  
##  Class :character   1st Qu.:  59   1st Qu.:  71   1st Qu.:  73  
##  Mode  :character   Median : 833   Median : 868   Median : 877  
##                     Mean   :1362   Mean   :1375   Mean   :1469  
##                     3rd Qu.:2215   3rd Qu.:2255   3rd Qu.:2348  
##                     Max.   :8224   Max.   :9724   Max.   :9449  
##   est_15_to_19    est_20_to_24    est_25_to_34    est_35_to_44  
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :    0  
##  1st Qu.:   72   1st Qu.:   65   1st Qu.:  147   1st Qu.:  165  
##  Median :  854   Median :  880   Median : 2014   Median : 1968  
##  Mean   : 1440   Mean   : 1522   Mean   : 3396   Mean   : 2959  
##  3rd Qu.: 2296   3rd Qu.: 2341   3rd Qu.: 5630   3rd Qu.: 4960  
##  Max.   :12266   Max.   :16890   Max.   :18544   Max.   :15930  
##   est_45_to_54    est_55_to_59   est_60_to_64   est_65_to_74   est_75_to_84   
##  Min.   :    0   Min.   :   0   Min.   :   0   Min.   :   0   Min.   :   0.0  
##  1st Qu.:  178   1st Qu.: 103   1st Qu.: 114   1st Qu.: 187   1st Qu.:  81.0  
##  Median : 2022   Median :1030   Median : 992   Median :1412   Median : 653.0  
##  Mean   : 2850   Mean   :1406   Mean   :1276   Mean   :1851   Mean   : 910.9  
##  3rd Qu.: 4717   3rd Qu.:2356   3rd Qu.:2162   3rd Qu.:3066   3rd Qu.:1509.0  
##  Max.   :13794   Max.   :6993   Max.   :6315   Max.   :8537   Max.   :5172.0  
##   est_85_over     est_median_age    
##  Min.   :   0.0   Length:1769       
##  1st Qu.:  24.0   Class :character  
##  Median : 264.0   Mode  :character  
##  Mean   : 432.5                     
##  3rd Qu.: 741.0                     
##  Max.   :3169.0

  summary(sb_clean)

##  Name of Firm        Trade Name        Address, line 1    Address, line 2   
##  Length:6020        Length:6020        Length:6020        Length:6020       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      City              State               Zip           
##  Length:6020        Length:6020        Length:6020       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Capabilities Narrative E-mail Address     Year Established   Contact         
##  Length:6020            Length:6020        Min.   : 200     Length:6020       
##  Class :character       Class :character   1st Qu.:1991     Class :character  
##  Mode  :character       Mode  :character   Median :2003     Mode  :character  
##                                            Mean   :1998                       
##                                            3rd Qu.:2010                       
##                                            Max.   :2023                       
##                                            NA's   :3

  summary(clean_active_biz_name)

##  LOCATION ACCOUNT # BUSINESS NAME        DBA NAME         STREET ADDRESS    
##  Length:511745      Length:511745      Length:511745      Length:511745     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      CITY             ZIP CODE         LOCATION DESCRIPTION MAILING ADDRESS   
##  Length:511745      Length:511745      Length:511745        Length:511745     
##  Class :character   Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character   Mode  :character     Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##  MAILING CITY       MAILING ZIP CODE       NAICS       
##  Length:511745      Length:511745      Min.   :115210  
##  Class :character   Class :character   1st Qu.:452000  
##  Mode  :character   Mode  :character   Median :541213  
##                                        Mean   :548421  
##                                        3rd Qu.:624200  
##                                        Max.   :999999  
##  PRIMARY NAICS DESCRIPTION COUNCIL DISTRICT LOCATION START DATE
##  Length:511745             Min.   : 0.000   Length:511745      
##  Class :character          1st Qu.: 2.000   Class :character   
##  Mode  :character          Median : 5.000   Mode  :character   
##                            Mean   : 6.425                      
##                            3rd Qu.:11.000                      
##                            Max.   :15.000                      
##    LOCATION        
##  Length:511745     
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Save the datasets to csv for the analysis step

  write.csv(demo_data, file = "lac_demographics.csv", row.names = FALSE)
  write.csv(sb_clean, file = "lac_small_business.csv", row.names = FALSE)
  write.csv(clean_active_biz_name, file = "lac_active_business.csv", row.names = FALSE)

Small_Business_Helper

2023-08-25

Initial LA County Small Business Cleaning

Install Necessary Packages

Import Packages

Load in datasets

Get summary of datasets

Cleaning the Active Business Dataset—————————————————————————————-

Clean the Active Business Dataset by extracting zip Code and remove any null/invalid zip code values

Clean Null values within NAICS description column

Clean Null Business Name

Drop the location end date, as all the values are null

Clean the Small Business Dataset ——————————————————————————————-

Clean the zip code column

Clean + Combine the unknown…4 and unknown…5 into the contact column

Remove null from: “Name of Firm”, “Capbilities Narrative”

Drop view column from Small Business dataset, essentially a row numerator

Clean California Demographics Dataset—————————————————————————————

Keep relevant columns and rename

Clean zip code column

Remove any nulls present in Demographic data rows

Check for duplicates

Creating a report for each of the datasets to check integrity

Save the datasets to csv for the analysis step