Install Necessary Packages

install.packages('tidyverse')

## Installing package into 'C:/Users/Eduardo Galvez/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Eduardo Galvez\AppData\Local\Temp\Rtmpk5J7nL\downloaded_packages

Import Packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)

Basic Analysis on the Age Demographics dataset——————————————————————————

Load in the demographics dataset

  demo <- read_csv('lac_demographics.csv', show_col_types = FALSE)
  str(demo)

## spc_tbl_ [1,769 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Zip_code      : num [1:1769] 89010 89019 89060 89061 89439 ...
##  $ est_under_5   : num [1:1769] 14 50 383 77 24 ...
##  $ est_5_to_9    : num [1:1769] 33 16 696 169 80 ...
##  $ est_10_to_14  : num [1:1769] 41 39 371 119 16 ...
##  $ est_15_to_19  : num [1:1769] 11 62 250 251 12 ...
##  $ est_20_to_24  : num [1:1769] 0 107 394 151 13 ...
##  $ est_25_to_34  : num [1:1769] 44 227 1124 501 60 ...
##  $ est_35_to_44  : num [1:1769] 74 96 740 305 132 ...
##  $ est_45_to_54  : num [1:1769] 34 278 1585 505 167 ...
##  $ est_55_to_59  : num [1:1769] 25 444 771 343 195 ...
##  $ est_60_to_64  : num [1:1769] 54 304 1144 695 167 ...
##  $ est_65_to_74  : num [1:1769] 109 400 1972 1621 153 ...
##  $ est_75_to_84  : num [1:1769] 30 39 1037 670 209 ...
##  $ est_85_over   : num [1:1769] 0 6 225 130 16 482 338 795 670 753 ...
##  $ est_median_age: chr [1:1769] "48.8" "57.3" "54.0" "62.9" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Zip_code = col_double(),
##   ..   est_under_5 = col_double(),
##   ..   est_5_to_9 = col_double(),
##   ..   est_10_to_14 = col_double(),
##   ..   est_15_to_19 = col_double(),
##   ..   est_20_to_24 = col_double(),
##   ..   est_25_to_34 = col_double(),
##   ..   est_35_to_44 = col_double(),
##   ..   est_45_to_54 = col_double(),
##   ..   est_55_to_59 = col_double(),
##   ..   est_60_to_64 = col_double(),
##   ..   est_65_to_74 = col_double(),
##   ..   est_75_to_84 = col_double(),
##   ..   est_85_over = col_double(),
##   ..   est_median_age = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Convert Median column from char to double

  demo <- demo %>%
    mutate(est_median_age = as.double(est_median_age))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `est_median_age = as.double(est_median_age)`.
## Caused by warning:
## ! NAs introduced by coercion

  str(demo)

## tibble [1,769 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Zip_code      : num [1:1769] 89010 89019 89060 89061 89439 ...
##  $ est_under_5   : num [1:1769] 14 50 383 77 24 ...
##  $ est_5_to_9    : num [1:1769] 33 16 696 169 80 ...
##  $ est_10_to_14  : num [1:1769] 41 39 371 119 16 ...
##  $ est_15_to_19  : num [1:1769] 11 62 250 251 12 ...
##  $ est_20_to_24  : num [1:1769] 0 107 394 151 13 ...
##  $ est_25_to_34  : num [1:1769] 44 227 1124 501 60 ...
##  $ est_35_to_44  : num [1:1769] 74 96 740 305 132 ...
##  $ est_45_to_54  : num [1:1769] 34 278 1585 505 167 ...
##  $ est_55_to_59  : num [1:1769] 25 444 771 343 195 ...
##  $ est_60_to_64  : num [1:1769] 54 304 1144 695 167 ...
##  $ est_65_to_74  : num [1:1769] 109 400 1972 1621 153 ...
##  $ est_75_to_84  : num [1:1769] 30 39 1037 670 209 ...
##  $ est_85_over   : num [1:1769] 0 6 225 130 16 482 338 795 670 753 ...
##  $ est_median_age: num [1:1769] 48.8 57.3 54 62.9 58.9 30.3 28.5 28.6 35.3 35.4 ...

  head(demo)

## # A tibble: 6 × 15
##   Zip_code est_under_5 est_5_to_9 est_10_to_14 est_15_to_19 est_20_to_24
##      <dbl>       <dbl>      <dbl>        <dbl>        <dbl>        <dbl>
## 1    89010          14         33           41           11            0
## 2    89019          50         16           39           62          107
## 3    89060         383        696          371          250          394
## 4    89061          77        169          119          251          151
## 5    89439          24         80           16           12           13
## 6    90001        4939       4903         5364         4511         4932
## # ℹ 9 more variables: est_25_to_34 <dbl>, est_35_to_44 <dbl>,
## #   est_45_to_54 <dbl>, est_55_to_59 <dbl>, est_60_to_64 <dbl>,
## #   est_65_to_74 <dbl>, est_75_to_84 <dbl>, est_85_over <dbl>,
## #   est_median_age <dbl>

Sum age count column to get total

  cols_to_convert <- c(
    "est_under_5", "est_5_to_9", "est_10_to_14", "est_15_to_19",
    "est_20_to_24", "est_25_to_34", "est_35_to_44", "est_45_to_54",
    "est_55_to_59", "est_60_to_64", "est_65_to_74", "est_75_to_84",
    "est_85_over"
  )

  demo[cols_to_convert] <- lapply(demo[cols_to_convert], function(x) as.numeric(x))


  demo <- demo %>%
    mutate(est_total_pop = est_under_5 + est_5_to_9 + est_10_to_14 + est_15_to_19 + est_20_to_24 + est_25_to_34 + est_35_to_44 
           +est_45_to_54 + est_55_to_59 + est_60_to_64 + est_65_to_74 + est_75_to_84 + est_85_over)

Convert the Age counts into percentage for analysis in the future

  demo <- demo %>%
    mutate(    
      est_under_5 = est_under_5 / est_total_pop,
      est_5_to_9 = est_5_to_9 / est_total_pop,
      est_10_to_14 = est_10_to_14 / est_total_pop,
      est_15_to_19 = est_15_to_19 / est_total_pop,
      est_20_to_24 = est_20_to_24 / est_total_pop,
      est_25_to_34 = est_25_to_34 / est_total_pop,
      est_35_to_44 = est_35_to_44 / est_total_pop,
      est_45_to_54 = est_45_to_54 / est_total_pop,
      est_55_to_59 = est_55_to_59 / est_total_pop,
      est_60_to_64 = est_60_to_64 / est_total_pop,
      est_65_to_74 = est_65_to_74 / est_total_pop,
      est_75_to_84 = est_75_to_84 / est_total_pop,
      est_85_over = est_85_over / est_total_pop
    )

Column distributions

  hist(demo$est_under_5, main = "Distribution of % pop aged under 5", xlab = "Age", ylab = "Frequency")

  hist(demo$est_5_to_9, main = "Distribution of % pop aged5 to 9", xlab = "Age", ylab = "Frequency")

  hist(demo$est_10_to_14, main = "Distribution of % pop aged 10 to 14", xlab = "Age", ylab = "Frequency")

  hist(demo$est_15_to_19, main = "Distribution of % pop aged 15 to 19", xlab = "Age", ylab = "Frequency")

  hist(demo$est_20_to_24, main = "Distribution of % pop aged 20 to 24", xlab = "Age", ylab = "Frequency")

  hist(demo$est_25_to_34, main = "Distribution of % pop aged 25 to 34", xlab = "Age", ylab = "Frequency")

  hist(demo$est_35_to_44, main = "Distribution of % pop aged 35 to 44", xlab = "Age", ylab = "Frequency")

  hist(demo$est_45_to_54, main = "Distribution of % pop aged 45 to 54", xlab = "Age", ylab = "Frequency")

  hist(demo$est_55_to_59, main = "Distribution of % pop aged 55 to 59", xlab = "Age", ylab = "Frequency")

  hist(demo$est_60_to_64, main = "Distribution of % pop aged 60 to 64", xlab = "Age", ylab = "Frequency")

  hist(demo$est_65_to_74, main = "Distribution of % pop aged 65 to 74", xlab = "Age", ylab = "Frequency")

  hist(demo$est_75_to_84, main = "Distribution of % pop aged 75 to 84", xlab = "Age", ylab = "Frequency")

  hist(demo$est_85_over, main = "Distribution of % pop aged 85 and over", xlab = "Age", ylab = "Frequency")

Interesting things I noticed about the % age count distributions

Under 5 age group is the smallest % of pop mostly
Age 15 to 19 and 20 to 24 have very similar distribution
25-34 appears to represent largest % of population
As is expected with normal distributions the 85 and over also makes up the smallest %

Active Business dataset basic analysis————————————————————————————–

Load in the active business dataset

  #active business dataset
  ab_data <- read_csv('lac_active_business.csv', show_col_types = FALSE)
  str(ab_data)

## spc_tbl_ [511,745 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ LOCATION ACCOUNT #       : chr [1:511745] "0000000150-0001-5" "0000000156-0001-2" "0002842614-0001-2" "0003176540-0001-1" ...
##  $ BUSINESS NAME            : chr [1:511745] "A A OFICINA CENTRAL HISPANA DE LOS ANGELES /C" "SPRINGBOARD NON-PROFIT CONSUMER CREDIT MANAGEMENT" "AUSTIN B CREEK" "SOEUNG CHAING" ...
##  $ DBA NAME                 : chr [1:511745] NA "MONEY MANAGEMENT INTERNATIONAL" NA "CAFECAFE" ...
##  $ STREET ADDRESS           : chr [1:511745] "2015 W TEMPLE STREET" "1605 W OLYMPIC BLVD #9023" "3610 W MAGNOLIA BLVD" "1330 W PACIFIC COAST HIGHWAY SUITE #E" ...
##  $ CITY                     : chr [1:511745] "LOS ANGELES" "LOS ANGELES" "BURBANK" "WILMINGTON" ...
##  $ ZIP CODE                 : chr [1:511745] "90026" "90015" "91505" "90744" ...
##  $ LOCATION DESCRIPTION     : chr [1:511745] "2015 TEMPLE 90026-4913" "1605 OLYMPIC 90015-3828" "3610 MAGNOLIA 91505-2913" "1330 PACIFIC COAST 90744-2467" ...
##  $ MAILING ADDRESS          : chr [1:511745] "2607 VAN BUREN PLACE" NA NA NA ...
##  $ MAILING CITY             : chr [1:511745] "LOS ANGELES" NA NA NA ...
##  $ MAILING ZIP CODE         : chr [1:511745] "90007-2129" NA NA NA ...
##  $ NAICS                    : num [1:511745] 611000 523900 711510 722211 561720 ...
##  $ PRIMARY NAICS DESCRIPTION: chr [1:511745] "Educational services (including schools, colleges, & universities)" "Other financial investment activities (including investment advice)" "Independent artists, writers, & performers" "Limited-service eating places" ...
##  $ COUNCIL DISTRICT         : num [1:511745] 13 1 0 15 6 11 0 11 14 0 ...
##  $ LOCATION START DATE      : chr [1:511745] "01/01/1991" "02/01/1999" "09/01/2008" "01/01/2020" ...
##  $ LOCATION                 : chr [1:511745] NA NA "(34.168, -118.3463)" "(33.7901, -118.2804)" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `LOCATION ACCOUNT #` = col_character(),
##   ..   `BUSINESS NAME` = col_character(),
##   ..   `DBA NAME` = col_character(),
##   ..   `STREET ADDRESS` = col_character(),
##   ..   CITY = col_character(),
##   ..   `ZIP CODE` = col_character(),
##   ..   `LOCATION DESCRIPTION` = col_character(),
##   ..   `MAILING ADDRESS` = col_character(),
##   ..   `MAILING CITY` = col_character(),
##   ..   `MAILING ZIP CODE` = col_character(),
##   ..   NAICS = col_double(),
##   ..   `PRIMARY NAICS DESCRIPTION` = col_character(),
##   ..   `COUNCIL DISTRICT` = col_double(),
##   ..   `LOCATION START DATE` = col_character(),
##   ..   LOCATION = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Get unique counts for the following columns: ZIP CODE, NAICS, PRIMARY NAICS DESCRIPTION

  unique_counts <- ab_data %>%
    summarise(across(everything(), n_distinct))

  for (col_name in colnames(unique_counts)) {
    cat("Column: ", col_name, "\n")
    cat("Unique Count: ", unique_counts[[col_name]], "\n\n")
  }

## Column:  LOCATION ACCOUNT # 
## Unique Count:  511745 
## 
## Column:  BUSINESS NAME 
## Unique Count:  482299 
## 
## Column:  DBA NAME 
## Unique Count:  167483 
## 
## Column:  STREET ADDRESS 
## Unique Count:  439068 
## 
## Column:  CITY 
## Unique Count:  3157 
## 
## Column:  ZIP CODE 
## Unique Count:  4815 
## 
## Column:  LOCATION DESCRIPTION 
## Unique Count:  366594 
## 
## Column:  MAILING ADDRESS 
## Unique Count:  200543 
## 
## Column:  MAILING CITY 
## Unique Count:  4546 
## 
## Column:  MAILING ZIP CODE 
## Unique Count:  149510 
## 
## Column:  NAICS 
## Unique Count:  289 
## 
## Column:  PRIMARY NAICS DESCRIPTION 
## Unique Count:  291 
## 
## Column:  COUNCIL DISTRICT 
## Unique Count:  16 
## 
## Column:  LOCATION START DATE 
## Unique Count:  15936 
## 
## Column:  LOCATION 
## Unique Count:  291380

there are 289 NAICS codes, and 291 Primary NAICS descriptions
I believe this discrepancy may be attributed to some typo in some of the descriptions

Basic analysis on small business dataset

  sb_data <- read_csv('lac_small_business.csv', show_col_types = FALSE)
  summary(sb_data)

##  Name of Firm        Trade Name        Address, line 1    Address, line 2   
##  Length:6020        Length:6020        Length:6020        Length:6020       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      City              State                Zip        Capabilities Narrative
##  Length:6020        Length:6020        Min.   :90001   Length:6020           
##  Class :character   Class :character   1st Qu.:90248   Class :character      
##  Mode  :character   Mode  :character   Median :90802   Mode  :character      
##                                        Mean   :90906                         
##                                        3rd Qu.:91355                         
##                                        Max.   :93591                         
##                                                                              
##  E-mail Address     Year Established   Contact         
##  Length:6020        Min.   : 200     Length:6020       
##  Class :character   1st Qu.:1991     Class :character  
##  Mode  :character   Median :2003     Mode  :character  
##                     Mean   :1998                       
##                     3rd Qu.:2010                       
##                     Max.   :2023                       
##                     NA's   :3

Joining Small Business Dataset with the Age demographics dataset

  sb_demo <- inner_join(sb_data, demo, by = c("Zip" = "Zip_code"))
  write.csv(sb_demo, file = "sb_demo.csv", row.names = FALSE)

Get unique column values

  sb_unique_counts <- sb_data %>%
    summarise(across(everything(), n_distinct))

  for (col_name in colnames(sb_unique_counts)) {
    cat("Column: ", col_name, "\n")
    cat("Unique Count: ", sb_unique_counts[[col_name]], "\n\n")
  }

## Column:  Name of Firm 
## Unique Count:  6017 
## 
## Column:  Trade Name 
## Unique Count:  1958 
## 
## Column:  Address, line 1 
## Unique Count:  5934 
## 
## Column:  Address, line 2 
## Unique Count:  214 
## 
## Column:  City 
## Unique Count:  274 
## 
## Column:  State 
## Unique Count:  1 
## 
## Column:  Zip 
## Unique Count:  280 
## 
## Column:  Capabilities Narrative 
## Unique Count:  5622 
## 
## Column:  E-mail Address 
## Unique Count:  5997 
## 
## Column:  Year Established 
## Unique Count:  108 
## 
## Column:  Contact 
## Unique Count:  5989

Get unique zip codes in small business, to select same zip codes in active business dataset

  unique_zips <- unique(sb_data$Zip)
  str(unique_zips)

##  num [1:280] 90277 90001 90010 90017 90011 ...

Use Zip codes from small business dataset to filter the active business dataset

  #Active Business data dimenisions before filter
  dim_before <- dim(ab_data)
  print(dim_before)

## [1] 511745     15

  ab_data <- ab_data[ab_data$`ZIP CODE` %in% unique_zips, ]
  
  #Active Business data dimenisions after filter
  dim_after <- dim(ab_data)
  print(dim_after)

## [1] 483092     15

Attemping to see if the business name can be used as an identifier

  suffix_map <- c(
    "street" = "st",
    "boulevard" = "blvd",
    "avenue" = "ave",
    "suite" = "ste",
    "road" = "rd",
    "lane" = "ln",
    "court" = "ct",
    "drive" = "dr",
    "circle" = "cir",
    "place" = "pl",
    "terrace" = "ter",
    "parkway" = "pkwy",
    "way" = "wy",
    "alley" = "aly",
    "crescent" = "cres",
    "loop" = "loop",
    "square" = "sq",
    "trail" = "trl",
    "highway" = "hwy",
    "expressway" = "expy",
    "route" = "rte",
    "freeway" = "fwy",
    "row" = "row",
    "cove" = "cove",
    "south" = "s",
    "north" = "n",
    "west" = "w",
    "east" = "e",
    "southeast" = "se",
    "southwest" = "sw",
    "northwest" = "nw",
    "northeast" = "ne",
    "apartment" = "apt"

    
  )


  replace_suffix <- function(text) {
    for (suffix in names(suffix_map)) {
      text <- gsub(suffix, suffix_map[suffix], text)
    }
    return(text)
  }




  sb_data <- sb_data %>%
    
    mutate(`Address, line 1` = tolower(`Address, line 1`) %>%
           sapply(., replace_suffix)) %>%
    mutate(sb_key = tolower(`Name of Firm`)   %>% paste0(tolower(`Trade Name`)) %>% paste0(as.double(`Zip`))  ) %>%
    mutate(sb_key = gsub("[^a-z0-9]", "", sb_key)) %>% 
    mutate(sb_zip_key = tolower(`Name of Firm`) %>% paste0(as.double(`Zip`))   ) %>% 
    mutate(sb_zip_key = gsub("[^a-z0-9]", "", sb_zip_key)) %>% 
    mutate(sb_addy = tolower(`Name of Firm`) %>% paste0(tolower(`Address, line 1`)) %>% paste0(as.double(`Zip`))    ) %>% 
    mutate(sb_addy = gsub("[^a-z0-9]", "", sb_addy))

  

  ab_data <- ab_data %>%
    mutate(`STREET ADDRESS` = tolower(`STREET ADDRESS`) %>%
           sapply(., replace_suffix)) %>%
    mutate(ab_key = tolower(`BUSINESS NAME`)  %>% paste0(tolower(`DBA NAME`)) %>% paste0(as.double(`ZIP CODE`)) ) %>%
    mutate(ab_key = gsub("[^a-z0-9]", "", ab_key)) %>% 
    mutate(ab_zip_key = tolower(`BUSINESS NAME`)  %>% paste0(as.double(`ZIP CODE`)) ) %>% 
    mutate(ab_zip_key = gsub("[^a-z0-9]", "", ab_zip_key)) %>% 
    mutate(ab_addy = tolower(`BUSINESS NAME`)  %>% paste0(tolower(`STREET ADDRESS`)) %>% paste0(as.double(`ZIP CODE`)) ) %>% 
    mutate(ab_addy = gsub("[^a-z0-9]", "", ab_addy)) 
  
  #Biz name + dba + zip key
  common_values <- intersect(sb_data$sb_key, ab_data$ab_key)
  common_count <- length(common_values)
  common_count

## [1] 580

  (common_count/ nrow(sb_data))*100

## [1] 9.634551

  #Biz name + zip key

  common_values_zip <- intersect(sb_data$sb_zip_key, ab_data$ab_zip_key)
  common_count_zip <- length(common_values_zip)
  common_count_zip

## [1] 950

  (common_count_zip/ nrow(sb_data))*100

## [1] 15.78073

  #Biz name + Address + zip key
  common_values_addy <- intersect(sb_data$sb_addy, ab_data$ab_addy)
  common_count_addy <- length(common_values_addy)
  common_count_addy

## [1] 601

  (common_count_addy/ nrow(sb_data))*100

## [1] 9.983389

It appears only 58 rows from the small business dataset that are also in the active business dataset
This is roughly 9.6% of the data in the small business dataset, which will have to do now for my analysis
Better to use the business + street address as key
This gets us closer to capturing 10% of the overlap between two datasets

Merging the small business dataset with the active business dataset

  sb_data <- sb_data %>%
    distinct(sb_addy, .keep_all = TRUE)
  ab_data <- ab_data %>%
    distinct(ab_addy, .keep_all = TRUE)


  #active small business dataset
  asb <- inner_join(sb_data, ab_data, by = c("sb_addy" = "ab_addy")) %>%
  select(`BUSINESS NAME`, `DBA NAME`, `ZIP CODE`, `BUSINESS NAME`, `LOCATION START DATE`, `LOCATION`,
         `NAICS`,`PRIMARY NAICS DESCRIPTION`,`E-mail Address`,`Year Established`,`Contact`,`STREET ADDRESS`,`CITY`)

  asb

## # A tibble: 601 × 12
##    `BUSINESS NAME`   `DBA NAME` `ZIP CODE` `LOCATION START DATE` LOCATION  NAICS
##    <chr>             <chr>      <chr>      <chr>                 <chr>     <dbl>
##  1 ELECTROLIZING INC <NA>       90011      09/01/1952            (34.021… 332000
##  2 STEMAR EQUIPMENT… <NA>       90013      08/01/1953            (34.043… 532400
##  3 PELTAR CORP       DAV-TECH … 91502      11/05/1988            (34.167… 423400
##  4 INDUSTRIAL LOCK … INDUSTRIA… 90245      04/01/1984            (33.920… 444130
##  5 BIRD BARRIER AME… <NA>       90746      10/05/1994            (33.84,… 454390
##  6 TRANSLATING SERV… LAZAR TRA… 91364      07/01/2003            (34.169… 541930
##  7 AERO MECHANISM P… <NA>       91311      03/01/1997            (34.247… 423400
##  8 PRIMARY INSTRUME… MICRO MET… 91311      03/24/1986            (34.243… 541940
##  9 W MACHINE WORKS … <NA>       91340      09/23/1980            (34.276… 332000
## 10 DOWELL ALUMINUM … <NA>       91605      08/01/1957            (34.195… 423500
## # ℹ 591 more rows
## # ℹ 6 more variables: `PRIMARY NAICS DESCRIPTION` <chr>,
## #   `E-mail Address` <chr>, `Year Established` <dbl>, Contact <chr>,
## #   `STREET ADDRESS` <chr>, CITY <chr>

  asb <- asb %>%
    mutate(`ZIP CODE` = as.double(`ZIP CODE`))

Issues with merging the Small Business Dataset and Active Business Dataset

There is no unique way to connect the two datasets
As I attempt to make the unique identifier between more unique, less data overlaps
This suggests that many of the records between the two datasets are unrelated and pulled from different areas

Join the Active Small business dataset with the age demographics dataset

  asb_demo <- inner_join(asb, demo, by = c("ZIP CODE" = "Zip_code"))
  colnames(asb_demo)

##  [1] "BUSINESS NAME"             "DBA NAME"                 
##  [3] "ZIP CODE"                  "LOCATION START DATE"      
##  [5] "LOCATION"                  "NAICS"                    
##  [7] "PRIMARY NAICS DESCRIPTION" "E-mail Address"           
##  [9] "Year Established"          "Contact"                  
## [11] "STREET ADDRESS"            "CITY"                     
## [13] "est_under_5"               "est_5_to_9"               
## [15] "est_10_to_14"              "est_15_to_19"             
## [17] "est_20_to_24"              "est_25_to_34"             
## [19] "est_35_to_44"              "est_45_to_54"             
## [21] "est_55_to_59"              "est_60_to_64"             
## [23] "est_65_to_74"              "est_75_to_84"             
## [25] "est_85_over"               "est_median_age"           
## [27] "est_total_pop"

Creating Latitude and Longitude column from the Location column

  asb_demo$LOCATION <- gsub("\\(|\\)", "", asb_demo$LOCATION)

  asb_demo <- asb_demo %>% 
    separate(LOCATION, into = c("Latitude", "Longitude"), sep = ", ")

Write Active Small Business with Demographics to csv file

  write.csv(asb_demo, file = "asb_demo.csv", row.names = FALSE)

Final thoughts on how I will conduct analysis

Will be dealing with small dataset(10% of small business dataset)
Need to conduct analysis on small sample size(already done)
This is due to the importance of the NAICS which standardizes what services the business offers(understood)
Need to join other datasets from the small business database here as well such as manufacturing and r&d and construction(completed)

Active Small Business Demographics summary

  summary(asb_demo)

##  BUSINESS NAME        DBA NAME            ZIP CODE     LOCATION START DATE
##  Length:601         Length:601         Min.   :90001   Length:601         
##  Class :character   Class :character   1st Qu.:90066   Class :character   
##  Mode  :character   Mode  :character   Median :90805   Mode  :character   
##                                        Mean   :90853                      
##                                        3rd Qu.:91351                      
##                                        Max.   :93552                      
##    Latitude          Longitude             NAICS       
##  Length:601         Length:601         Min.   :115310  
##  Class :character   Class :character   1st Qu.:235930  
##  Mode  :character   Mode  :character   Median :424950  
##                                        Mean   :419374  
##                                        3rd Qu.:541370  
##                                        Max.   :999999  
##  PRIMARY NAICS DESCRIPTION E-mail Address     Year Established
##  Length:601                Length:601         Min.   :1878    
##  Class :character          Class :character   1st Qu.:1986    
##  Mode  :character          Mode  :character   Median :2000    
##                                               Mean   :1996    
##                                               3rd Qu.:2010    
##                                               Max.   :2022    
##    Contact          STREET ADDRESS         CITY            est_under_5     
##  Length:601         Length:601         Length:601         Min.   :0.00000  
##  Class :character   Class :character   Class :character   1st Qu.:0.04861  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.05606  
##                                                           Mean   :0.05588  
##                                                           3rd Qu.:0.06409  
##                                                           Max.   :0.09627  
##    est_5_to_9       est_10_to_14      est_15_to_19      est_20_to_24    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.04894   1st Qu.:0.04819   1st Qu.:0.04857   1st Qu.:0.05573  
##  Median :0.05687   Median :0.05962   Median :0.06028   Median :0.06561  
##  Mean   :0.05589   Mean   :0.05892   Mean   :0.06045   Mean   :0.06961  
##  3rd Qu.:0.06308   3rd Qu.:0.06979   3rd Qu.:0.06853   3rd Qu.:0.08007  
##  Max.   :0.12388   Max.   :0.10157   Max.   :0.23557   Max.   :0.28167  
##   est_25_to_34      est_35_to_44      est_45_to_54      est_55_to_59    
##  Min.   :0.05018   Min.   :0.05258   Min.   :0.05637   Min.   :0.02960  
##  1st Qu.:0.13668   1st Qu.:0.12830   1st Qu.:0.12464   1st Qu.:0.05641  
##  Median :0.15385   Median :0.13667   Median :0.13297   Median :0.06350  
##  Mean   :0.16373   Mean   :0.13753   Mean   :0.13328   Mean   :0.06410  
##  3rd Qu.:0.17695   3rd Qu.:0.14990   3rd Qu.:0.14147   3rd Qu.:0.07222  
##  Max.   :0.39474   Max.   :0.24749   Max.   :0.26974   Max.   :0.10654  
##   est_60_to_64      est_65_to_74      est_75_to_84      est_85_over      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.01447   Min.   :0.001332  
##  1st Qu.:0.05011   1st Qu.:0.06716   1st Qu.:0.03125   1st Qu.:0.013688  
##  Median :0.05634   Median :0.07923   Median :0.03926   Median :0.019067  
##  Mean   :0.05774   Mean   :0.08024   Mean   :0.04120   Mean   :0.021438  
##  3rd Qu.:0.06601   3rd Qu.:0.09378   3rd Qu.:0.05180   3rd Qu.:0.028930  
##  Max.   :0.09363   Max.   :0.17148   Max.   :0.20391   Max.   :0.140625  
##  est_median_age  est_total_pop   
##  Min.   :22.80   Min.   :   152  
##  1st Qu.:34.80   1st Qu.: 29371  
##  Median :37.30   Median : 41811  
##  Mean   :37.62   Mean   : 43288  
##  3rd Qu.:40.90   3rd Qu.: 53637  
##  Max.   :66.50   Max.   :110750

  unique_counts <- sapply(asb_demo, function(col) length(unique(col)))
  unique_counts

##             BUSINESS NAME                  DBA NAME                  ZIP CODE 
##                       600                       181                       197 
##       LOCATION START DATE                  Latitude                 Longitude 
##                       530                       533                       541 
##                     NAICS PRIMARY NAICS DESCRIPTION            E-mail Address 
##                       129                       129                       600 
##          Year Established                   Contact            STREET ADDRESS 
##                        81                       601                       598 
##                      CITY               est_under_5                est_5_to_9 
##                       106                       197                       197 
##              est_10_to_14              est_15_to_19              est_20_to_24 
##                       197                       197                       197 
##              est_25_to_34              est_35_to_44              est_45_to_54 
##                       197                       197                       197 
##              est_55_to_59              est_60_to_64              est_65_to_74 
##                       197                       197                       197 
##              est_75_to_84               est_85_over            est_median_age 
##                       197                       197                       123 
##             est_total_pop 
##                       196

  #NAICS counts
  ns_counts <- table(asb_demo$`NAICS`)
  
  ns_counts_df <- data.frame(Name = names(ns_counts), Count = as.numeric(ns_counts))
  ns_counts_df <- ns_counts_df[order(-ns_counts_df$Count), ]
  
  # Top NAICS code is 233210(Single Family Housing Construction)
  top_ns <- head(ns_counts_df, n = 3)
  
  top_ns

##      Name Count
## 2  233210    52
## 96 541600    32
## 21 238990    21

  #NAICS description counts
  nd_counts <- table(asb_demo$`PRIMARY NAICS DESCRIPTION`)
  
  nd_counts_df <- data.frame(Name = names(nd_counts), Count = as.numeric(nd_counts))
  nd_counts_df <- nd_counts_df[order(-nd_counts_df$Count), ]
  
  # Top NAICS Description is Single Family Housing Construction
  top_nd <- head(nd_counts_df, n = 3)
  
  top_nd

##                                                        Name Count
## 110         Single Family Housing Construction (1997 NAICS)    52
## 66  Management, scientific, & technical consulting services    32
## 7                     All other specialty trade contractors    21

  #zip code counts
  zip_counts <- table(asb_demo$`ZIP CODE`)
  
  zip_counts_df <- data.frame(Name = names(zip_counts), Count = as.numeric(zip_counts))
  zip_counts_df <- zip_counts_df[order(-zip_counts_df$Count), ]
  
  # 91311 Zip Code has most small businesses based on limited data
  top_zips <- head(zip_counts_df, n = 3)
  
  top_zips

##      Name Count
## 135 91311    41
## 158 91406    14
## 92  90670    13

Correlation between population and businesses in zip code

  zip_biz_count <- sb_demo %>%
    group_by(Zip) %>%
    summarize(count = n())
  
  
  zip_biz_count <- zip_biz_count %>%
    left_join(demo, by = c("Zip" = "Zip_code"))
  zip_biz_count

## # A tibble: 276 × 17
##      Zip count est_under_5 est_5_to_9 est_10_to_14 est_15_to_19 est_20_to_24
##    <dbl> <int>       <dbl>      <dbl>        <dbl>        <dbl>        <dbl>
##  1 90001    14      0.0837     0.0831       0.0910       0.0765       0.0836
##  2 90002     6      0.0800     0.0866       0.0944       0.0868       0.0920
##  3 90003     6      0.0831     0.0892       0.0948       0.0891       0.0708
##  4 90004    12      0.0613     0.0538       0.0491       0.0465       0.0735
##  5 90005     8      0.0535     0.0538       0.0501       0.0426       0.0670
##  6 90006    19      0.0513     0.0688       0.0590       0.0626       0.0746
##  7 90007    13      0.0277     0.0365       0.0399       0.119        0.282 
##  8 90008    13      0.0452     0.0355       0.0629       0.0475       0.0590
##  9 90010    33      0.0166     0.0579       0            0.0231       0.0725
## 10 90011     9      0.0727     0.0878       0.0853       0.0815       0.0898
## # ℹ 266 more rows
## # ℹ 10 more variables: est_25_to_34 <dbl>, est_35_to_44 <dbl>,
## #   est_45_to_54 <dbl>, est_55_to_59 <dbl>, est_60_to_64 <dbl>,
## #   est_65_to_74 <dbl>, est_75_to_84 <dbl>, est_85_over <dbl>,
## #   est_median_age <dbl>, est_total_pop <dbl>

  correlation <- cor(zip_biz_count$count, zip_biz_count$est_total_pop)
  correlation

## [1] 0.1475194

  ggplot(zip_biz_count, aes(x = count, y = est_total_pop)) +
    geom_point() +  # Scatter plot
    geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Trend line
    geom_text(aes(x = max(count), y = max(est_total_pop),
                  label = paste("Correlation:", round(correlation, 3))),
              nudge_x = -100, nudge_y = 500, color = "blue") +  # Correlation coefficient annotation
    labs(title = "Small Business count vs Population(By Zip Code)",
         x = "Small Business Count",
         y = "Estimated Total Population") +
    theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Amount of small businesses created by year

  sb_count_year <- sb_demo %>%
    filter(!is.na(`Year Established`) & nchar(`Year Established`) >= 4) %>%
    group_by(`Year Established`) %>%
    summarize(count = n())
  

  ggplot(sb_count_year, aes(x = `Year Established`, y = count)) +
    geom_bar(stat = "identity", fill = "blue") +
    labs(title = "Small Businesses established per year in the LA county", x = "Year Established", y = "Small Business Count") +
    scale_x_continuous(breaks = seq(min(sb_count_year$`Year Established`), max(sb_count_year$`Year Established`), by = 10))

sb_analysis

2023-08-29

Install Necessary Packages

Import Packages

Basic Analysis on the Age Demographics dataset——————————————————————————

Load in the demographics dataset

Convert Median column from char to double

Sum age count column to get total

Convert the Age counts into percentage for analysis in the future

Column distributions

Interesting things I noticed about the % age count distributions

Active Business dataset basic analysis————————————————————————————–

Load in the active business dataset

Get unique counts for the following columns: ZIP CODE, NAICS, PRIMARY NAICS DESCRIPTION

Basic analysis on small business dataset

Joining Small Business Dataset with the Age demographics dataset

Get unique column values

Get unique zip codes in small business, to select same zip codes in active business dataset

Use Zip codes from small business dataset to filter the active business dataset

Attemping to see if the business name can be used as an identifier

Merging the small business dataset with the active business dataset

Issues with merging the Small Business Dataset and Active Business Dataset

Join the Active Small business dataset with the age demographics dataset

Creating Latitude and Longitude column from the Location column

Write Active Small Business with Demographics to csv file

Final thoughts on how I will conduct analysis

Active Small Business Demographics summary

Correlation between population and businesses in zip code