Install Necessary Packages

install.packages('tidyverse')
## Installing package into 'C:/Users/Eduardo Galvez/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Eduardo Galvez\AppData\Local\Temp\Rtmpk5J7nL\downloaded_packages

Import Packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)

Basic Analysis on the Age Demographics dataset——————————————————————————

Load in the demographics dataset

  demo <- read_csv('lac_demographics.csv', show_col_types = FALSE)
  str(demo)
## spc_tbl_ [1,769 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Zip_code      : num [1:1769] 89010 89019 89060 89061 89439 ...
##  $ est_under_5   : num [1:1769] 14 50 383 77 24 ...
##  $ est_5_to_9    : num [1:1769] 33 16 696 169 80 ...
##  $ est_10_to_14  : num [1:1769] 41 39 371 119 16 ...
##  $ est_15_to_19  : num [1:1769] 11 62 250 251 12 ...
##  $ est_20_to_24  : num [1:1769] 0 107 394 151 13 ...
##  $ est_25_to_34  : num [1:1769] 44 227 1124 501 60 ...
##  $ est_35_to_44  : num [1:1769] 74 96 740 305 132 ...
##  $ est_45_to_54  : num [1:1769] 34 278 1585 505 167 ...
##  $ est_55_to_59  : num [1:1769] 25 444 771 343 195 ...
##  $ est_60_to_64  : num [1:1769] 54 304 1144 695 167 ...
##  $ est_65_to_74  : num [1:1769] 109 400 1972 1621 153 ...
##  $ est_75_to_84  : num [1:1769] 30 39 1037 670 209 ...
##  $ est_85_over   : num [1:1769] 0 6 225 130 16 482 338 795 670 753 ...
##  $ est_median_age: chr [1:1769] "48.8" "57.3" "54.0" "62.9" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Zip_code = col_double(),
##   ..   est_under_5 = col_double(),
##   ..   est_5_to_9 = col_double(),
##   ..   est_10_to_14 = col_double(),
##   ..   est_15_to_19 = col_double(),
##   ..   est_20_to_24 = col_double(),
##   ..   est_25_to_34 = col_double(),
##   ..   est_35_to_44 = col_double(),
##   ..   est_45_to_54 = col_double(),
##   ..   est_55_to_59 = col_double(),
##   ..   est_60_to_64 = col_double(),
##   ..   est_65_to_74 = col_double(),
##   ..   est_75_to_84 = col_double(),
##   ..   est_85_over = col_double(),
##   ..   est_median_age = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Convert Median column from char to double

  demo <- demo %>%
    mutate(est_median_age = as.double(est_median_age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `est_median_age = as.double(est_median_age)`.
## Caused by warning:
## ! NAs introduced by coercion
  str(demo)
## tibble [1,769 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Zip_code      : num [1:1769] 89010 89019 89060 89061 89439 ...
##  $ est_under_5   : num [1:1769] 14 50 383 77 24 ...
##  $ est_5_to_9    : num [1:1769] 33 16 696 169 80 ...
##  $ est_10_to_14  : num [1:1769] 41 39 371 119 16 ...
##  $ est_15_to_19  : num [1:1769] 11 62 250 251 12 ...
##  $ est_20_to_24  : num [1:1769] 0 107 394 151 13 ...
##  $ est_25_to_34  : num [1:1769] 44 227 1124 501 60 ...
##  $ est_35_to_44  : num [1:1769] 74 96 740 305 132 ...
##  $ est_45_to_54  : num [1:1769] 34 278 1585 505 167 ...
##  $ est_55_to_59  : num [1:1769] 25 444 771 343 195 ...
##  $ est_60_to_64  : num [1:1769] 54 304 1144 695 167 ...
##  $ est_65_to_74  : num [1:1769] 109 400 1972 1621 153 ...
##  $ est_75_to_84  : num [1:1769] 30 39 1037 670 209 ...
##  $ est_85_over   : num [1:1769] 0 6 225 130 16 482 338 795 670 753 ...
##  $ est_median_age: num [1:1769] 48.8 57.3 54 62.9 58.9 30.3 28.5 28.6 35.3 35.4 ...
  head(demo)
## # A tibble: 6 × 15
##   Zip_code est_under_5 est_5_to_9 est_10_to_14 est_15_to_19 est_20_to_24
##      <dbl>       <dbl>      <dbl>        <dbl>        <dbl>        <dbl>
## 1    89010          14         33           41           11            0
## 2    89019          50         16           39           62          107
## 3    89060         383        696          371          250          394
## 4    89061          77        169          119          251          151
## 5    89439          24         80           16           12           13
## 6    90001        4939       4903         5364         4511         4932
## # ℹ 9 more variables: est_25_to_34 <dbl>, est_35_to_44 <dbl>,
## #   est_45_to_54 <dbl>, est_55_to_59 <dbl>, est_60_to_64 <dbl>,
## #   est_65_to_74 <dbl>, est_75_to_84 <dbl>, est_85_over <dbl>,
## #   est_median_age <dbl>

Sum age count column to get total

  cols_to_convert <- c(
    "est_under_5", "est_5_to_9", "est_10_to_14", "est_15_to_19",
    "est_20_to_24", "est_25_to_34", "est_35_to_44", "est_45_to_54",
    "est_55_to_59", "est_60_to_64", "est_65_to_74", "est_75_to_84",
    "est_85_over"
  )

  demo[cols_to_convert] <- lapply(demo[cols_to_convert], function(x) as.numeric(x))


  demo <- demo %>%
    mutate(est_total_pop = est_under_5 + est_5_to_9 + est_10_to_14 + est_15_to_19 + est_20_to_24 + est_25_to_34 + est_35_to_44 
           +est_45_to_54 + est_55_to_59 + est_60_to_64 + est_65_to_74 + est_75_to_84 + est_85_over) 

Convert the Age counts into percentage for analysis in the future

  demo <- demo %>%
    mutate(    
      est_under_5 = est_under_5 / est_total_pop,
      est_5_to_9 = est_5_to_9 / est_total_pop,
      est_10_to_14 = est_10_to_14 / est_total_pop,
      est_15_to_19 = est_15_to_19 / est_total_pop,
      est_20_to_24 = est_20_to_24 / est_total_pop,
      est_25_to_34 = est_25_to_34 / est_total_pop,
      est_35_to_44 = est_35_to_44 / est_total_pop,
      est_45_to_54 = est_45_to_54 / est_total_pop,
      est_55_to_59 = est_55_to_59 / est_total_pop,
      est_60_to_64 = est_60_to_64 / est_total_pop,
      est_65_to_74 = est_65_to_74 / est_total_pop,
      est_75_to_84 = est_75_to_84 / est_total_pop,
      est_85_over = est_85_over / est_total_pop
    )

Column distributions

  hist(demo$est_under_5, main = "Distribution of % pop aged under 5", xlab = "Age", ylab = "Frequency")

  hist(demo$est_5_to_9, main = "Distribution of % pop aged5 to 9", xlab = "Age", ylab = "Frequency")

  hist(demo$est_10_to_14, main = "Distribution of % pop aged 10 to 14", xlab = "Age", ylab = "Frequency")

  hist(demo$est_15_to_19, main = "Distribution of % pop aged 15 to 19", xlab = "Age", ylab = "Frequency")

  hist(demo$est_20_to_24, main = "Distribution of % pop aged 20 to 24", xlab = "Age", ylab = "Frequency")

  hist(demo$est_25_to_34, main = "Distribution of % pop aged 25 to 34", xlab = "Age", ylab = "Frequency")

  hist(demo$est_35_to_44, main = "Distribution of % pop aged 35 to 44", xlab = "Age", ylab = "Frequency")

  hist(demo$est_45_to_54, main = "Distribution of % pop aged 45 to 54", xlab = "Age", ylab = "Frequency")

  hist(demo$est_55_to_59, main = "Distribution of % pop aged 55 to 59", xlab = "Age", ylab = "Frequency")

  hist(demo$est_60_to_64, main = "Distribution of % pop aged 60 to 64", xlab = "Age", ylab = "Frequency")

  hist(demo$est_65_to_74, main = "Distribution of % pop aged 65 to 74", xlab = "Age", ylab = "Frequency")

  hist(demo$est_75_to_84, main = "Distribution of % pop aged 75 to 84", xlab = "Age", ylab = "Frequency")

  hist(demo$est_85_over, main = "Distribution of % pop aged 85 and over", xlab = "Age", ylab = "Frequency")

Interesting things I noticed about the % age count distributions

  • Under 5 age group is the smallest % of pop mostly
  • Age 15 to 19 and 20 to 24 have very similar distribution
  • 25-34 appears to represent largest % of population
  • As is expected with normal distributions the 85 and over also makes up the smallest %

Active Business dataset basic analysis————————————————————————————–

Load in the active business dataset

  #active business dataset
  ab_data <- read_csv('lac_active_business.csv', show_col_types = FALSE)
  str(ab_data)
## spc_tbl_ [511,745 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ LOCATION ACCOUNT #       : chr [1:511745] "0000000150-0001-5" "0000000156-0001-2" "0002842614-0001-2" "0003176540-0001-1" ...
##  $ BUSINESS NAME            : chr [1:511745] "A A OFICINA CENTRAL HISPANA DE LOS ANGELES /C" "SPRINGBOARD NON-PROFIT CONSUMER CREDIT MANAGEMENT" "AUSTIN B CREEK" "SOEUNG CHAING" ...
##  $ DBA NAME                 : chr [1:511745] NA "MONEY MANAGEMENT INTERNATIONAL" NA "CAFECAFE" ...
##  $ STREET ADDRESS           : chr [1:511745] "2015 W TEMPLE STREET" "1605 W OLYMPIC BLVD #9023" "3610 W MAGNOLIA BLVD" "1330 W PACIFIC COAST HIGHWAY SUITE #E" ...
##  $ CITY                     : chr [1:511745] "LOS ANGELES" "LOS ANGELES" "BURBANK" "WILMINGTON" ...
##  $ ZIP CODE                 : chr [1:511745] "90026" "90015" "91505" "90744" ...
##  $ LOCATION DESCRIPTION     : chr [1:511745] "2015 TEMPLE 90026-4913" "1605 OLYMPIC 90015-3828" "3610 MAGNOLIA 91505-2913" "1330 PACIFIC COAST 90744-2467" ...
##  $ MAILING ADDRESS          : chr [1:511745] "2607 VAN BUREN PLACE" NA NA NA ...
##  $ MAILING CITY             : chr [1:511745] "LOS ANGELES" NA NA NA ...
##  $ MAILING ZIP CODE         : chr [1:511745] "90007-2129" NA NA NA ...
##  $ NAICS                    : num [1:511745] 611000 523900 711510 722211 561720 ...
##  $ PRIMARY NAICS DESCRIPTION: chr [1:511745] "Educational services (including schools, colleges, & universities)" "Other financial investment activities (including investment advice)" "Independent artists, writers, & performers" "Limited-service eating places" ...
##  $ COUNCIL DISTRICT         : num [1:511745] 13 1 0 15 6 11 0 11 14 0 ...
##  $ LOCATION START DATE      : chr [1:511745] "01/01/1991" "02/01/1999" "09/01/2008" "01/01/2020" ...
##  $ LOCATION                 : chr [1:511745] NA NA "(34.168, -118.3463)" "(33.7901, -118.2804)" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `LOCATION ACCOUNT #` = col_character(),
##   ..   `BUSINESS NAME` = col_character(),
##   ..   `DBA NAME` = col_character(),
##   ..   `STREET ADDRESS` = col_character(),
##   ..   CITY = col_character(),
##   ..   `ZIP CODE` = col_character(),
##   ..   `LOCATION DESCRIPTION` = col_character(),
##   ..   `MAILING ADDRESS` = col_character(),
##   ..   `MAILING CITY` = col_character(),
##   ..   `MAILING ZIP CODE` = col_character(),
##   ..   NAICS = col_double(),
##   ..   `PRIMARY NAICS DESCRIPTION` = col_character(),
##   ..   `COUNCIL DISTRICT` = col_double(),
##   ..   `LOCATION START DATE` = col_character(),
##   ..   LOCATION = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Get unique counts for the following columns: ZIP CODE, NAICS, PRIMARY NAICS DESCRIPTION

  unique_counts <- ab_data %>%
    summarise(across(everything(), n_distinct))

  for (col_name in colnames(unique_counts)) {
    cat("Column: ", col_name, "\n")
    cat("Unique Count: ", unique_counts[[col_name]], "\n\n")
  }
## Column:  LOCATION ACCOUNT # 
## Unique Count:  511745 
## 
## Column:  BUSINESS NAME 
## Unique Count:  482299 
## 
## Column:  DBA NAME 
## Unique Count:  167483 
## 
## Column:  STREET ADDRESS 
## Unique Count:  439068 
## 
## Column:  CITY 
## Unique Count:  3157 
## 
## Column:  ZIP CODE 
## Unique Count:  4815 
## 
## Column:  LOCATION DESCRIPTION 
## Unique Count:  366594 
## 
## Column:  MAILING ADDRESS 
## Unique Count:  200543 
## 
## Column:  MAILING CITY 
## Unique Count:  4546 
## 
## Column:  MAILING ZIP CODE 
## Unique Count:  149510 
## 
## Column:  NAICS 
## Unique Count:  289 
## 
## Column:  PRIMARY NAICS DESCRIPTION 
## Unique Count:  291 
## 
## Column:  COUNCIL DISTRICT 
## Unique Count:  16 
## 
## Column:  LOCATION START DATE 
## Unique Count:  15936 
## 
## Column:  LOCATION 
## Unique Count:  291380
  • there are 289 NAICS codes, and 291 Primary NAICS descriptions
  • I believe this discrepancy may be attributed to some typo in some of the descriptions

Basic analysis on small business dataset

  sb_data <- read_csv('lac_small_business.csv', show_col_types = FALSE)
  summary(sb_data)
##  Name of Firm        Trade Name        Address, line 1    Address, line 2   
##  Length:6020        Length:6020        Length:6020        Length:6020       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      City              State                Zip        Capabilities Narrative
##  Length:6020        Length:6020        Min.   :90001   Length:6020           
##  Class :character   Class :character   1st Qu.:90248   Class :character      
##  Mode  :character   Mode  :character   Median :90802   Mode  :character      
##                                        Mean   :90906                         
##                                        3rd Qu.:91355                         
##                                        Max.   :93591                         
##                                                                              
##  E-mail Address     Year Established   Contact         
##  Length:6020        Min.   : 200     Length:6020       
##  Class :character   1st Qu.:1991     Class :character  
##  Mode  :character   Median :2003     Mode  :character  
##                     Mean   :1998                       
##                     3rd Qu.:2010                       
##                     Max.   :2023                       
##                     NA's   :3

Joining Small Business Dataset with the Age demographics dataset

  sb_demo <- inner_join(sb_data, demo, by = c("Zip" = "Zip_code"))
  write.csv(sb_demo, file = "sb_demo.csv", row.names = FALSE)

Get unique column values

  sb_unique_counts <- sb_data %>%
    summarise(across(everything(), n_distinct))

  for (col_name in colnames(sb_unique_counts)) {
    cat("Column: ", col_name, "\n")
    cat("Unique Count: ", sb_unique_counts[[col_name]], "\n\n")
  }
## Column:  Name of Firm 
## Unique Count:  6017 
## 
## Column:  Trade Name 
## Unique Count:  1958 
## 
## Column:  Address, line 1 
## Unique Count:  5934 
## 
## Column:  Address, line 2 
## Unique Count:  214 
## 
## Column:  City 
## Unique Count:  274 
## 
## Column:  State 
## Unique Count:  1 
## 
## Column:  Zip 
## Unique Count:  280 
## 
## Column:  Capabilities Narrative 
## Unique Count:  5622 
## 
## Column:  E-mail Address 
## Unique Count:  5997 
## 
## Column:  Year Established 
## Unique Count:  108 
## 
## Column:  Contact 
## Unique Count:  5989

Get unique zip codes in small business, to select same zip codes in active business dataset

  unique_zips <- unique(sb_data$Zip)
  str(unique_zips)
##  num [1:280] 90277 90001 90010 90017 90011 ...

Use Zip codes from small business dataset to filter the active business dataset

  #Active Business data dimenisions before filter
  dim_before <- dim(ab_data)
  print(dim_before)
## [1] 511745     15
  ab_data <- ab_data[ab_data$`ZIP CODE` %in% unique_zips, ]
  
  #Active Business data dimenisions after filter
  dim_after <- dim(ab_data)
  print(dim_after)
## [1] 483092     15

Attemping to see if the business name can be used as an identifier

  suffix_map <- c(
    "street" = "st",
    "boulevard" = "blvd",
    "avenue" = "ave",
    "suite" = "ste",
    "road" = "rd",
    "lane" = "ln",
    "court" = "ct",
    "drive" = "dr",
    "circle" = "cir",
    "place" = "pl",
    "terrace" = "ter",
    "parkway" = "pkwy",
    "way" = "wy",
    "alley" = "aly",
    "crescent" = "cres",
    "loop" = "loop",
    "square" = "sq",
    "trail" = "trl",
    "highway" = "hwy",
    "expressway" = "expy",
    "route" = "rte",
    "freeway" = "fwy",
    "row" = "row",
    "cove" = "cove",
    "south" = "s",
    "north" = "n",
    "west" = "w",
    "east" = "e",
    "southeast" = "se",
    "southwest" = "sw",
    "northwest" = "nw",
    "northeast" = "ne",
    "apartment" = "apt"

    
  )


  replace_suffix <- function(text) {
    for (suffix in names(suffix_map)) {
      text <- gsub(suffix, suffix_map[suffix], text)
    }
    return(text)
  }




  sb_data <- sb_data %>%
    
    mutate(`Address, line 1` = tolower(`Address, line 1`) %>%
           sapply(., replace_suffix)) %>%
    mutate(sb_key = tolower(`Name of Firm`)   %>% paste0(tolower(`Trade Name`)) %>% paste0(as.double(`Zip`))  ) %>%
    mutate(sb_key = gsub("[^a-z0-9]", "", sb_key)) %>% 
    mutate(sb_zip_key = tolower(`Name of Firm`) %>% paste0(as.double(`Zip`))   ) %>% 
    mutate(sb_zip_key = gsub("[^a-z0-9]", "", sb_zip_key)) %>% 
    mutate(sb_addy = tolower(`Name of Firm`) %>% paste0(tolower(`Address, line 1`)) %>% paste0(as.double(`Zip`))    ) %>% 
    mutate(sb_addy = gsub("[^a-z0-9]", "", sb_addy))

  

  ab_data <- ab_data %>%
    mutate(`STREET ADDRESS` = tolower(`STREET ADDRESS`) %>%
           sapply(., replace_suffix)) %>%
    mutate(ab_key = tolower(`BUSINESS NAME`)  %>% paste0(tolower(`DBA NAME`)) %>% paste0(as.double(`ZIP CODE`)) ) %>%
    mutate(ab_key = gsub("[^a-z0-9]", "", ab_key)) %>% 
    mutate(ab_zip_key = tolower(`BUSINESS NAME`)  %>% paste0(as.double(`ZIP CODE`)) ) %>% 
    mutate(ab_zip_key = gsub("[^a-z0-9]", "", ab_zip_key)) %>% 
    mutate(ab_addy = tolower(`BUSINESS NAME`)  %>% paste0(tolower(`STREET ADDRESS`)) %>% paste0(as.double(`ZIP CODE`)) ) %>% 
    mutate(ab_addy = gsub("[^a-z0-9]", "", ab_addy)) 
  
  #Biz name + dba + zip key
  common_values <- intersect(sb_data$sb_key, ab_data$ab_key)
  common_count <- length(common_values)
  common_count
## [1] 580
  (common_count/ nrow(sb_data))*100
## [1] 9.634551
  #Biz name + zip key

  common_values_zip <- intersect(sb_data$sb_zip_key, ab_data$ab_zip_key)
  common_count_zip <- length(common_values_zip)
  common_count_zip
## [1] 950
  (common_count_zip/ nrow(sb_data))*100
## [1] 15.78073
  #Biz name + Address + zip key
  common_values_addy <- intersect(sb_data$sb_addy, ab_data$ab_addy)
  common_count_addy <- length(common_values_addy)
  common_count_addy
## [1] 601
  (common_count_addy/ nrow(sb_data))*100
## [1] 9.983389
  • It appears only 58 rows from the small business dataset that are also in the active business dataset

  • This is roughly 9.6% of the data in the small business dataset, which will have to do now for my analysis

  • Better to use the business + street address as key

  • This gets us closer to capturing 10% of the overlap between two datasets

Merging the small business dataset with the active business dataset

  sb_data <- sb_data %>%
    distinct(sb_addy, .keep_all = TRUE)
  ab_data <- ab_data %>%
    distinct(ab_addy, .keep_all = TRUE)


  #active small business dataset
  asb <- inner_join(sb_data, ab_data, by = c("sb_addy" = "ab_addy")) %>%
  select(`BUSINESS NAME`, `DBA NAME`, `ZIP CODE`, `BUSINESS NAME`, `LOCATION START DATE`, `LOCATION`,
         `NAICS`,`PRIMARY NAICS DESCRIPTION`,`E-mail Address`,`Year Established`,`Contact`,`STREET ADDRESS`,`CITY`)

  asb
## # A tibble: 601 × 12
##    `BUSINESS NAME`   `DBA NAME` `ZIP CODE` `LOCATION START DATE` LOCATION  NAICS
##    <chr>             <chr>      <chr>      <chr>                 <chr>     <dbl>
##  1 ELECTROLIZING INC <NA>       90011      09/01/1952            (34.021… 332000
##  2 STEMAR EQUIPMENT… <NA>       90013      08/01/1953            (34.043… 532400
##  3 PELTAR CORP       DAV-TECH … 91502      11/05/1988            (34.167… 423400
##  4 INDUSTRIAL LOCK … INDUSTRIA… 90245      04/01/1984            (33.920… 444130
##  5 BIRD BARRIER AME… <NA>       90746      10/05/1994            (33.84,… 454390
##  6 TRANSLATING SERV… LAZAR TRA… 91364      07/01/2003            (34.169… 541930
##  7 AERO MECHANISM P… <NA>       91311      03/01/1997            (34.247… 423400
##  8 PRIMARY INSTRUME… MICRO MET… 91311      03/24/1986            (34.243… 541940
##  9 W MACHINE WORKS … <NA>       91340      09/23/1980            (34.276… 332000
## 10 DOWELL ALUMINUM … <NA>       91605      08/01/1957            (34.195… 423500
## # ℹ 591 more rows
## # ℹ 6 more variables: `PRIMARY NAICS DESCRIPTION` <chr>,
## #   `E-mail Address` <chr>, `Year Established` <dbl>, Contact <chr>,
## #   `STREET ADDRESS` <chr>, CITY <chr>
  asb <- asb %>%
    mutate(`ZIP CODE` = as.double(`ZIP CODE`))

Issues with merging the Small Business Dataset and Active Business Dataset

  • There is no unique way to connect the two datasets
  • As I attempt to make the unique identifier between more unique, less data overlaps
  • This suggests that many of the records between the two datasets are unrelated and pulled from different areas

Join the Active Small business dataset with the age demographics dataset

  asb_demo <- inner_join(asb, demo, by = c("ZIP CODE" = "Zip_code"))
  colnames(asb_demo)
##  [1] "BUSINESS NAME"             "DBA NAME"                 
##  [3] "ZIP CODE"                  "LOCATION START DATE"      
##  [5] "LOCATION"                  "NAICS"                    
##  [7] "PRIMARY NAICS DESCRIPTION" "E-mail Address"           
##  [9] "Year Established"          "Contact"                  
## [11] "STREET ADDRESS"            "CITY"                     
## [13] "est_under_5"               "est_5_to_9"               
## [15] "est_10_to_14"              "est_15_to_19"             
## [17] "est_20_to_24"              "est_25_to_34"             
## [19] "est_35_to_44"              "est_45_to_54"             
## [21] "est_55_to_59"              "est_60_to_64"             
## [23] "est_65_to_74"              "est_75_to_84"             
## [25] "est_85_over"               "est_median_age"           
## [27] "est_total_pop"

Creating Latitude and Longitude column from the Location column

  asb_demo$LOCATION <- gsub("\\(|\\)", "", asb_demo$LOCATION)

  asb_demo <- asb_demo %>% 
    separate(LOCATION, into = c("Latitude", "Longitude"), sep = ", ") 

Write Active Small Business with Demographics to csv file

  write.csv(asb_demo, file = "asb_demo.csv", row.names = FALSE)

Final thoughts on how I will conduct analysis

  • Will be dealing with small dataset(10% of small business dataset)
  • Need to conduct analysis on small sample size(already done)
  • This is due to the importance of the NAICS which standardizes what services the business offers(understood)
  • Need to join other datasets from the small business database here as well such as manufacturing and r&d and construction(completed)

Active Small Business Demographics summary

  summary(asb_demo)
##  BUSINESS NAME        DBA NAME            ZIP CODE     LOCATION START DATE
##  Length:601         Length:601         Min.   :90001   Length:601         
##  Class :character   Class :character   1st Qu.:90066   Class :character   
##  Mode  :character   Mode  :character   Median :90805   Mode  :character   
##                                        Mean   :90853                      
##                                        3rd Qu.:91351                      
##                                        Max.   :93552                      
##    Latitude          Longitude             NAICS       
##  Length:601         Length:601         Min.   :115310  
##  Class :character   Class :character   1st Qu.:235930  
##  Mode  :character   Mode  :character   Median :424950  
##                                        Mean   :419374  
##                                        3rd Qu.:541370  
##                                        Max.   :999999  
##  PRIMARY NAICS DESCRIPTION E-mail Address     Year Established
##  Length:601                Length:601         Min.   :1878    
##  Class :character          Class :character   1st Qu.:1986    
##  Mode  :character          Mode  :character   Median :2000    
##                                               Mean   :1996    
##                                               3rd Qu.:2010    
##                                               Max.   :2022    
##    Contact          STREET ADDRESS         CITY            est_under_5     
##  Length:601         Length:601         Length:601         Min.   :0.00000  
##  Class :character   Class :character   Class :character   1st Qu.:0.04861  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.05606  
##                                                           Mean   :0.05588  
##                                                           3rd Qu.:0.06409  
##                                                           Max.   :0.09627  
##    est_5_to_9       est_10_to_14      est_15_to_19      est_20_to_24    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.04894   1st Qu.:0.04819   1st Qu.:0.04857   1st Qu.:0.05573  
##  Median :0.05687   Median :0.05962   Median :0.06028   Median :0.06561  
##  Mean   :0.05589   Mean   :0.05892   Mean   :0.06045   Mean   :0.06961  
##  3rd Qu.:0.06308   3rd Qu.:0.06979   3rd Qu.:0.06853   3rd Qu.:0.08007  
##  Max.   :0.12388   Max.   :0.10157   Max.   :0.23557   Max.   :0.28167  
##   est_25_to_34      est_35_to_44      est_45_to_54      est_55_to_59    
##  Min.   :0.05018   Min.   :0.05258   Min.   :0.05637   Min.   :0.02960  
##  1st Qu.:0.13668   1st Qu.:0.12830   1st Qu.:0.12464   1st Qu.:0.05641  
##  Median :0.15385   Median :0.13667   Median :0.13297   Median :0.06350  
##  Mean   :0.16373   Mean   :0.13753   Mean   :0.13328   Mean   :0.06410  
##  3rd Qu.:0.17695   3rd Qu.:0.14990   3rd Qu.:0.14147   3rd Qu.:0.07222  
##  Max.   :0.39474   Max.   :0.24749   Max.   :0.26974   Max.   :0.10654  
##   est_60_to_64      est_65_to_74      est_75_to_84      est_85_over      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.01447   Min.   :0.001332  
##  1st Qu.:0.05011   1st Qu.:0.06716   1st Qu.:0.03125   1st Qu.:0.013688  
##  Median :0.05634   Median :0.07923   Median :0.03926   Median :0.019067  
##  Mean   :0.05774   Mean   :0.08024   Mean   :0.04120   Mean   :0.021438  
##  3rd Qu.:0.06601   3rd Qu.:0.09378   3rd Qu.:0.05180   3rd Qu.:0.028930  
##  Max.   :0.09363   Max.   :0.17148   Max.   :0.20391   Max.   :0.140625  
##  est_median_age  est_total_pop   
##  Min.   :22.80   Min.   :   152  
##  1st Qu.:34.80   1st Qu.: 29371  
##  Median :37.30   Median : 41811  
##  Mean   :37.62   Mean   : 43288  
##  3rd Qu.:40.90   3rd Qu.: 53637  
##  Max.   :66.50   Max.   :110750
  unique_counts <- sapply(asb_demo, function(col) length(unique(col)))
  unique_counts
##             BUSINESS NAME                  DBA NAME                  ZIP CODE 
##                       600                       181                       197 
##       LOCATION START DATE                  Latitude                 Longitude 
##                       530                       533                       541 
##                     NAICS PRIMARY NAICS DESCRIPTION            E-mail Address 
##                       129                       129                       600 
##          Year Established                   Contact            STREET ADDRESS 
##                        81                       601                       598 
##                      CITY               est_under_5                est_5_to_9 
##                       106                       197                       197 
##              est_10_to_14              est_15_to_19              est_20_to_24 
##                       197                       197                       197 
##              est_25_to_34              est_35_to_44              est_45_to_54 
##                       197                       197                       197 
##              est_55_to_59              est_60_to_64              est_65_to_74 
##                       197                       197                       197 
##              est_75_to_84               est_85_over            est_median_age 
##                       197                       197                       123 
##             est_total_pop 
##                       196
  #NAICS counts
  ns_counts <- table(asb_demo$`NAICS`)
  
  ns_counts_df <- data.frame(Name = names(ns_counts), Count = as.numeric(ns_counts))
  ns_counts_df <- ns_counts_df[order(-ns_counts_df$Count), ]
  
  # Top NAICS code is 233210(Single Family Housing Construction)
  top_ns <- head(ns_counts_df, n = 3)
  
  top_ns
##      Name Count
## 2  233210    52
## 96 541600    32
## 21 238990    21
  #NAICS description counts
  nd_counts <- table(asb_demo$`PRIMARY NAICS DESCRIPTION`)
  
  nd_counts_df <- data.frame(Name = names(nd_counts), Count = as.numeric(nd_counts))
  nd_counts_df <- nd_counts_df[order(-nd_counts_df$Count), ]
  
  # Top NAICS Description is Single Family Housing Construction
  top_nd <- head(nd_counts_df, n = 3)
  
  top_nd
##                                                        Name Count
## 110         Single Family Housing Construction (1997 NAICS)    52
## 66  Management, scientific, & technical consulting services    32
## 7                     All other specialty trade contractors    21
  #zip code counts
  zip_counts <- table(asb_demo$`ZIP CODE`)
  
  zip_counts_df <- data.frame(Name = names(zip_counts), Count = as.numeric(zip_counts))
  zip_counts_df <- zip_counts_df[order(-zip_counts_df$Count), ]
  
  # 91311 Zip Code has most small businesses based on limited data
  top_zips <- head(zip_counts_df, n = 3)
  
  top_zips
##      Name Count
## 135 91311    41
## 158 91406    14
## 92  90670    13

Correlation between population and businesses in zip code

  zip_biz_count <- sb_demo %>%
    group_by(Zip) %>%
    summarize(count = n())
  
  
  zip_biz_count <- zip_biz_count %>%
    left_join(demo, by = c("Zip" = "Zip_code"))
  zip_biz_count
## # A tibble: 276 × 17
##      Zip count est_under_5 est_5_to_9 est_10_to_14 est_15_to_19 est_20_to_24
##    <dbl> <int>       <dbl>      <dbl>        <dbl>        <dbl>        <dbl>
##  1 90001    14      0.0837     0.0831       0.0910       0.0765       0.0836
##  2 90002     6      0.0800     0.0866       0.0944       0.0868       0.0920
##  3 90003     6      0.0831     0.0892       0.0948       0.0891       0.0708
##  4 90004    12      0.0613     0.0538       0.0491       0.0465       0.0735
##  5 90005     8      0.0535     0.0538       0.0501       0.0426       0.0670
##  6 90006    19      0.0513     0.0688       0.0590       0.0626       0.0746
##  7 90007    13      0.0277     0.0365       0.0399       0.119        0.282 
##  8 90008    13      0.0452     0.0355       0.0629       0.0475       0.0590
##  9 90010    33      0.0166     0.0579       0            0.0231       0.0725
## 10 90011     9      0.0727     0.0878       0.0853       0.0815       0.0898
## # ℹ 266 more rows
## # ℹ 10 more variables: est_25_to_34 <dbl>, est_35_to_44 <dbl>,
## #   est_45_to_54 <dbl>, est_55_to_59 <dbl>, est_60_to_64 <dbl>,
## #   est_65_to_74 <dbl>, est_75_to_84 <dbl>, est_85_over <dbl>,
## #   est_median_age <dbl>, est_total_pop <dbl>
  correlation <- cor(zip_biz_count$count, zip_biz_count$est_total_pop)
  correlation
## [1] 0.1475194
  ggplot(zip_biz_count, aes(x = count, y = est_total_pop)) +
    geom_point() +  # Scatter plot
    geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Trend line
    geom_text(aes(x = max(count), y = max(est_total_pop),
                  label = paste("Correlation:", round(correlation, 3))),
              nudge_x = -100, nudge_y = 500, color = "blue") +  # Correlation coefficient annotation
    labs(title = "Small Business count vs Population(By Zip Code)",
         x = "Small Business Count",
         y = "Estimated Total Population") +
    theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Amount of small businesses created by year

  sb_count_year <- sb_demo %>%
    filter(!is.na(`Year Established`) & nchar(`Year Established`) >= 4) %>%
    group_by(`Year Established`) %>%
    summarize(count = n())
  

  ggplot(sb_count_year, aes(x = `Year Established`, y = count)) +
    geom_bar(stat = "identity", fill = "blue") +
    labs(title = "Small Businesses established per year in the LA county", x = "Year Established", y = "Small Business Count") +
    scale_x_continuous(breaks = seq(min(sb_count_year$`Year Established`), max(sb_count_year$`Year Established`), by = 10))