install.packages('tidyverse')
## Installing package into 'C:/Users/Eduardo Galvez/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Eduardo Galvez\AppData\Local\Temp\Rtmpk5J7nL\downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
demo <- read_csv('lac_demographics.csv', show_col_types = FALSE)
str(demo)
## spc_tbl_ [1,769 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Zip_code : num [1:1769] 89010 89019 89060 89061 89439 ...
## $ est_under_5 : num [1:1769] 14 50 383 77 24 ...
## $ est_5_to_9 : num [1:1769] 33 16 696 169 80 ...
## $ est_10_to_14 : num [1:1769] 41 39 371 119 16 ...
## $ est_15_to_19 : num [1:1769] 11 62 250 251 12 ...
## $ est_20_to_24 : num [1:1769] 0 107 394 151 13 ...
## $ est_25_to_34 : num [1:1769] 44 227 1124 501 60 ...
## $ est_35_to_44 : num [1:1769] 74 96 740 305 132 ...
## $ est_45_to_54 : num [1:1769] 34 278 1585 505 167 ...
## $ est_55_to_59 : num [1:1769] 25 444 771 343 195 ...
## $ est_60_to_64 : num [1:1769] 54 304 1144 695 167 ...
## $ est_65_to_74 : num [1:1769] 109 400 1972 1621 153 ...
## $ est_75_to_84 : num [1:1769] 30 39 1037 670 209 ...
## $ est_85_over : num [1:1769] 0 6 225 130 16 482 338 795 670 753 ...
## $ est_median_age: chr [1:1769] "48.8" "57.3" "54.0" "62.9" ...
## - attr(*, "spec")=
## .. cols(
## .. Zip_code = col_double(),
## .. est_under_5 = col_double(),
## .. est_5_to_9 = col_double(),
## .. est_10_to_14 = col_double(),
## .. est_15_to_19 = col_double(),
## .. est_20_to_24 = col_double(),
## .. est_25_to_34 = col_double(),
## .. est_35_to_44 = col_double(),
## .. est_45_to_54 = col_double(),
## .. est_55_to_59 = col_double(),
## .. est_60_to_64 = col_double(),
## .. est_65_to_74 = col_double(),
## .. est_75_to_84 = col_double(),
## .. est_85_over = col_double(),
## .. est_median_age = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
demo <- demo %>%
mutate(est_median_age = as.double(est_median_age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `est_median_age = as.double(est_median_age)`.
## Caused by warning:
## ! NAs introduced by coercion
str(demo)
## tibble [1,769 × 15] (S3: tbl_df/tbl/data.frame)
## $ Zip_code : num [1:1769] 89010 89019 89060 89061 89439 ...
## $ est_under_5 : num [1:1769] 14 50 383 77 24 ...
## $ est_5_to_9 : num [1:1769] 33 16 696 169 80 ...
## $ est_10_to_14 : num [1:1769] 41 39 371 119 16 ...
## $ est_15_to_19 : num [1:1769] 11 62 250 251 12 ...
## $ est_20_to_24 : num [1:1769] 0 107 394 151 13 ...
## $ est_25_to_34 : num [1:1769] 44 227 1124 501 60 ...
## $ est_35_to_44 : num [1:1769] 74 96 740 305 132 ...
## $ est_45_to_54 : num [1:1769] 34 278 1585 505 167 ...
## $ est_55_to_59 : num [1:1769] 25 444 771 343 195 ...
## $ est_60_to_64 : num [1:1769] 54 304 1144 695 167 ...
## $ est_65_to_74 : num [1:1769] 109 400 1972 1621 153 ...
## $ est_75_to_84 : num [1:1769] 30 39 1037 670 209 ...
## $ est_85_over : num [1:1769] 0 6 225 130 16 482 338 795 670 753 ...
## $ est_median_age: num [1:1769] 48.8 57.3 54 62.9 58.9 30.3 28.5 28.6 35.3 35.4 ...
head(demo)
## # A tibble: 6 × 15
## Zip_code est_under_5 est_5_to_9 est_10_to_14 est_15_to_19 est_20_to_24
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 89010 14 33 41 11 0
## 2 89019 50 16 39 62 107
## 3 89060 383 696 371 250 394
## 4 89061 77 169 119 251 151
## 5 89439 24 80 16 12 13
## 6 90001 4939 4903 5364 4511 4932
## # ℹ 9 more variables: est_25_to_34 <dbl>, est_35_to_44 <dbl>,
## # est_45_to_54 <dbl>, est_55_to_59 <dbl>, est_60_to_64 <dbl>,
## # est_65_to_74 <dbl>, est_75_to_84 <dbl>, est_85_over <dbl>,
## # est_median_age <dbl>
cols_to_convert <- c(
"est_under_5", "est_5_to_9", "est_10_to_14", "est_15_to_19",
"est_20_to_24", "est_25_to_34", "est_35_to_44", "est_45_to_54",
"est_55_to_59", "est_60_to_64", "est_65_to_74", "est_75_to_84",
"est_85_over"
)
demo[cols_to_convert] <- lapply(demo[cols_to_convert], function(x) as.numeric(x))
demo <- demo %>%
mutate(est_total_pop = est_under_5 + est_5_to_9 + est_10_to_14 + est_15_to_19 + est_20_to_24 + est_25_to_34 + est_35_to_44
+est_45_to_54 + est_55_to_59 + est_60_to_64 + est_65_to_74 + est_75_to_84 + est_85_over)
demo <- demo %>%
mutate(
est_under_5 = est_under_5 / est_total_pop,
est_5_to_9 = est_5_to_9 / est_total_pop,
est_10_to_14 = est_10_to_14 / est_total_pop,
est_15_to_19 = est_15_to_19 / est_total_pop,
est_20_to_24 = est_20_to_24 / est_total_pop,
est_25_to_34 = est_25_to_34 / est_total_pop,
est_35_to_44 = est_35_to_44 / est_total_pop,
est_45_to_54 = est_45_to_54 / est_total_pop,
est_55_to_59 = est_55_to_59 / est_total_pop,
est_60_to_64 = est_60_to_64 / est_total_pop,
est_65_to_74 = est_65_to_74 / est_total_pop,
est_75_to_84 = est_75_to_84 / est_total_pop,
est_85_over = est_85_over / est_total_pop
)
hist(demo$est_under_5, main = "Distribution of % pop aged under 5", xlab = "Age", ylab = "Frequency")
hist(demo$est_5_to_9, main = "Distribution of % pop aged5 to 9", xlab = "Age", ylab = "Frequency")
hist(demo$est_10_to_14, main = "Distribution of % pop aged 10 to 14", xlab = "Age", ylab = "Frequency")
hist(demo$est_15_to_19, main = "Distribution of % pop aged 15 to 19", xlab = "Age", ylab = "Frequency")
hist(demo$est_20_to_24, main = "Distribution of % pop aged 20 to 24", xlab = "Age", ylab = "Frequency")
hist(demo$est_25_to_34, main = "Distribution of % pop aged 25 to 34", xlab = "Age", ylab = "Frequency")
hist(demo$est_35_to_44, main = "Distribution of % pop aged 35 to 44", xlab = "Age", ylab = "Frequency")
hist(demo$est_45_to_54, main = "Distribution of % pop aged 45 to 54", xlab = "Age", ylab = "Frequency")
hist(demo$est_55_to_59, main = "Distribution of % pop aged 55 to 59", xlab = "Age", ylab = "Frequency")
hist(demo$est_60_to_64, main = "Distribution of % pop aged 60 to 64", xlab = "Age", ylab = "Frequency")
hist(demo$est_65_to_74, main = "Distribution of % pop aged 65 to 74", xlab = "Age", ylab = "Frequency")
hist(demo$est_75_to_84, main = "Distribution of % pop aged 75 to 84", xlab = "Age", ylab = "Frequency")
hist(demo$est_85_over, main = "Distribution of % pop aged 85 and over", xlab = "Age", ylab = "Frequency")
#active business dataset
ab_data <- read_csv('lac_active_business.csv', show_col_types = FALSE)
str(ab_data)
## spc_tbl_ [511,745 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ LOCATION ACCOUNT # : chr [1:511745] "0000000150-0001-5" "0000000156-0001-2" "0002842614-0001-2" "0003176540-0001-1" ...
## $ BUSINESS NAME : chr [1:511745] "A A OFICINA CENTRAL HISPANA DE LOS ANGELES /C" "SPRINGBOARD NON-PROFIT CONSUMER CREDIT MANAGEMENT" "AUSTIN B CREEK" "SOEUNG CHAING" ...
## $ DBA NAME : chr [1:511745] NA "MONEY MANAGEMENT INTERNATIONAL" NA "CAFECAFE" ...
## $ STREET ADDRESS : chr [1:511745] "2015 W TEMPLE STREET" "1605 W OLYMPIC BLVD #9023" "3610 W MAGNOLIA BLVD" "1330 W PACIFIC COAST HIGHWAY SUITE #E" ...
## $ CITY : chr [1:511745] "LOS ANGELES" "LOS ANGELES" "BURBANK" "WILMINGTON" ...
## $ ZIP CODE : chr [1:511745] "90026" "90015" "91505" "90744" ...
## $ LOCATION DESCRIPTION : chr [1:511745] "2015 TEMPLE 90026-4913" "1605 OLYMPIC 90015-3828" "3610 MAGNOLIA 91505-2913" "1330 PACIFIC COAST 90744-2467" ...
## $ MAILING ADDRESS : chr [1:511745] "2607 VAN BUREN PLACE" NA NA NA ...
## $ MAILING CITY : chr [1:511745] "LOS ANGELES" NA NA NA ...
## $ MAILING ZIP CODE : chr [1:511745] "90007-2129" NA NA NA ...
## $ NAICS : num [1:511745] 611000 523900 711510 722211 561720 ...
## $ PRIMARY NAICS DESCRIPTION: chr [1:511745] "Educational services (including schools, colleges, & universities)" "Other financial investment activities (including investment advice)" "Independent artists, writers, & performers" "Limited-service eating places" ...
## $ COUNCIL DISTRICT : num [1:511745] 13 1 0 15 6 11 0 11 14 0 ...
## $ LOCATION START DATE : chr [1:511745] "01/01/1991" "02/01/1999" "09/01/2008" "01/01/2020" ...
## $ LOCATION : chr [1:511745] NA NA "(34.168, -118.3463)" "(33.7901, -118.2804)" ...
## - attr(*, "spec")=
## .. cols(
## .. `LOCATION ACCOUNT #` = col_character(),
## .. `BUSINESS NAME` = col_character(),
## .. `DBA NAME` = col_character(),
## .. `STREET ADDRESS` = col_character(),
## .. CITY = col_character(),
## .. `ZIP CODE` = col_character(),
## .. `LOCATION DESCRIPTION` = col_character(),
## .. `MAILING ADDRESS` = col_character(),
## .. `MAILING CITY` = col_character(),
## .. `MAILING ZIP CODE` = col_character(),
## .. NAICS = col_double(),
## .. `PRIMARY NAICS DESCRIPTION` = col_character(),
## .. `COUNCIL DISTRICT` = col_double(),
## .. `LOCATION START DATE` = col_character(),
## .. LOCATION = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
unique_counts <- ab_data %>%
summarise(across(everything(), n_distinct))
for (col_name in colnames(unique_counts)) {
cat("Column: ", col_name, "\n")
cat("Unique Count: ", unique_counts[[col_name]], "\n\n")
}
## Column: LOCATION ACCOUNT #
## Unique Count: 511745
##
## Column: BUSINESS NAME
## Unique Count: 482299
##
## Column: DBA NAME
## Unique Count: 167483
##
## Column: STREET ADDRESS
## Unique Count: 439068
##
## Column: CITY
## Unique Count: 3157
##
## Column: ZIP CODE
## Unique Count: 4815
##
## Column: LOCATION DESCRIPTION
## Unique Count: 366594
##
## Column: MAILING ADDRESS
## Unique Count: 200543
##
## Column: MAILING CITY
## Unique Count: 4546
##
## Column: MAILING ZIP CODE
## Unique Count: 149510
##
## Column: NAICS
## Unique Count: 289
##
## Column: PRIMARY NAICS DESCRIPTION
## Unique Count: 291
##
## Column: COUNCIL DISTRICT
## Unique Count: 16
##
## Column: LOCATION START DATE
## Unique Count: 15936
##
## Column: LOCATION
## Unique Count: 291380
sb_data <- read_csv('lac_small_business.csv', show_col_types = FALSE)
summary(sb_data)
## Name of Firm Trade Name Address, line 1 Address, line 2
## Length:6020 Length:6020 Length:6020 Length:6020
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## City State Zip Capabilities Narrative
## Length:6020 Length:6020 Min. :90001 Length:6020
## Class :character Class :character 1st Qu.:90248 Class :character
## Mode :character Mode :character Median :90802 Mode :character
## Mean :90906
## 3rd Qu.:91355
## Max. :93591
##
## E-mail Address Year Established Contact
## Length:6020 Min. : 200 Length:6020
## Class :character 1st Qu.:1991 Class :character
## Mode :character Median :2003 Mode :character
## Mean :1998
## 3rd Qu.:2010
## Max. :2023
## NA's :3
sb_demo <- inner_join(sb_data, demo, by = c("Zip" = "Zip_code"))
write.csv(sb_demo, file = "sb_demo.csv", row.names = FALSE)
sb_unique_counts <- sb_data %>%
summarise(across(everything(), n_distinct))
for (col_name in colnames(sb_unique_counts)) {
cat("Column: ", col_name, "\n")
cat("Unique Count: ", sb_unique_counts[[col_name]], "\n\n")
}
## Column: Name of Firm
## Unique Count: 6017
##
## Column: Trade Name
## Unique Count: 1958
##
## Column: Address, line 1
## Unique Count: 5934
##
## Column: Address, line 2
## Unique Count: 214
##
## Column: City
## Unique Count: 274
##
## Column: State
## Unique Count: 1
##
## Column: Zip
## Unique Count: 280
##
## Column: Capabilities Narrative
## Unique Count: 5622
##
## Column: E-mail Address
## Unique Count: 5997
##
## Column: Year Established
## Unique Count: 108
##
## Column: Contact
## Unique Count: 5989
unique_zips <- unique(sb_data$Zip)
str(unique_zips)
## num [1:280] 90277 90001 90010 90017 90011 ...
#Active Business data dimenisions before filter
dim_before <- dim(ab_data)
print(dim_before)
## [1] 511745 15
ab_data <- ab_data[ab_data$`ZIP CODE` %in% unique_zips, ]
#Active Business data dimenisions after filter
dim_after <- dim(ab_data)
print(dim_after)
## [1] 483092 15
suffix_map <- c(
"street" = "st",
"boulevard" = "blvd",
"avenue" = "ave",
"suite" = "ste",
"road" = "rd",
"lane" = "ln",
"court" = "ct",
"drive" = "dr",
"circle" = "cir",
"place" = "pl",
"terrace" = "ter",
"parkway" = "pkwy",
"way" = "wy",
"alley" = "aly",
"crescent" = "cres",
"loop" = "loop",
"square" = "sq",
"trail" = "trl",
"highway" = "hwy",
"expressway" = "expy",
"route" = "rte",
"freeway" = "fwy",
"row" = "row",
"cove" = "cove",
"south" = "s",
"north" = "n",
"west" = "w",
"east" = "e",
"southeast" = "se",
"southwest" = "sw",
"northwest" = "nw",
"northeast" = "ne",
"apartment" = "apt"
)
replace_suffix <- function(text) {
for (suffix in names(suffix_map)) {
text <- gsub(suffix, suffix_map[suffix], text)
}
return(text)
}
sb_data <- sb_data %>%
mutate(`Address, line 1` = tolower(`Address, line 1`) %>%
sapply(., replace_suffix)) %>%
mutate(sb_key = tolower(`Name of Firm`) %>% paste0(tolower(`Trade Name`)) %>% paste0(as.double(`Zip`)) ) %>%
mutate(sb_key = gsub("[^a-z0-9]", "", sb_key)) %>%
mutate(sb_zip_key = tolower(`Name of Firm`) %>% paste0(as.double(`Zip`)) ) %>%
mutate(sb_zip_key = gsub("[^a-z0-9]", "", sb_zip_key)) %>%
mutate(sb_addy = tolower(`Name of Firm`) %>% paste0(tolower(`Address, line 1`)) %>% paste0(as.double(`Zip`)) ) %>%
mutate(sb_addy = gsub("[^a-z0-9]", "", sb_addy))
ab_data <- ab_data %>%
mutate(`STREET ADDRESS` = tolower(`STREET ADDRESS`) %>%
sapply(., replace_suffix)) %>%
mutate(ab_key = tolower(`BUSINESS NAME`) %>% paste0(tolower(`DBA NAME`)) %>% paste0(as.double(`ZIP CODE`)) ) %>%
mutate(ab_key = gsub("[^a-z0-9]", "", ab_key)) %>%
mutate(ab_zip_key = tolower(`BUSINESS NAME`) %>% paste0(as.double(`ZIP CODE`)) ) %>%
mutate(ab_zip_key = gsub("[^a-z0-9]", "", ab_zip_key)) %>%
mutate(ab_addy = tolower(`BUSINESS NAME`) %>% paste0(tolower(`STREET ADDRESS`)) %>% paste0(as.double(`ZIP CODE`)) ) %>%
mutate(ab_addy = gsub("[^a-z0-9]", "", ab_addy))
#Biz name + dba + zip key
common_values <- intersect(sb_data$sb_key, ab_data$ab_key)
common_count <- length(common_values)
common_count
## [1] 580
(common_count/ nrow(sb_data))*100
## [1] 9.634551
#Biz name + zip key
common_values_zip <- intersect(sb_data$sb_zip_key, ab_data$ab_zip_key)
common_count_zip <- length(common_values_zip)
common_count_zip
## [1] 950
(common_count_zip/ nrow(sb_data))*100
## [1] 15.78073
#Biz name + Address + zip key
common_values_addy <- intersect(sb_data$sb_addy, ab_data$ab_addy)
common_count_addy <- length(common_values_addy)
common_count_addy
## [1] 601
(common_count_addy/ nrow(sb_data))*100
## [1] 9.983389
It appears only 58 rows from the small business dataset that are also in the active business dataset
This is roughly 9.6% of the data in the small business dataset, which will have to do now for my analysis
Better to use the business + street address as key
This gets us closer to capturing 10% of the overlap between two datasets
sb_data <- sb_data %>%
distinct(sb_addy, .keep_all = TRUE)
ab_data <- ab_data %>%
distinct(ab_addy, .keep_all = TRUE)
#active small business dataset
asb <- inner_join(sb_data, ab_data, by = c("sb_addy" = "ab_addy")) %>%
select(`BUSINESS NAME`, `DBA NAME`, `ZIP CODE`, `BUSINESS NAME`, `LOCATION START DATE`, `LOCATION`,
`NAICS`,`PRIMARY NAICS DESCRIPTION`,`E-mail Address`,`Year Established`,`Contact`,`STREET ADDRESS`,`CITY`)
asb
## # A tibble: 601 × 12
## `BUSINESS NAME` `DBA NAME` `ZIP CODE` `LOCATION START DATE` LOCATION NAICS
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 ELECTROLIZING INC <NA> 90011 09/01/1952 (34.021… 332000
## 2 STEMAR EQUIPMENT… <NA> 90013 08/01/1953 (34.043… 532400
## 3 PELTAR CORP DAV-TECH … 91502 11/05/1988 (34.167… 423400
## 4 INDUSTRIAL LOCK … INDUSTRIA… 90245 04/01/1984 (33.920… 444130
## 5 BIRD BARRIER AME… <NA> 90746 10/05/1994 (33.84,… 454390
## 6 TRANSLATING SERV… LAZAR TRA… 91364 07/01/2003 (34.169… 541930
## 7 AERO MECHANISM P… <NA> 91311 03/01/1997 (34.247… 423400
## 8 PRIMARY INSTRUME… MICRO MET… 91311 03/24/1986 (34.243… 541940
## 9 W MACHINE WORKS … <NA> 91340 09/23/1980 (34.276… 332000
## 10 DOWELL ALUMINUM … <NA> 91605 08/01/1957 (34.195… 423500
## # ℹ 591 more rows
## # ℹ 6 more variables: `PRIMARY NAICS DESCRIPTION` <chr>,
## # `E-mail Address` <chr>, `Year Established` <dbl>, Contact <chr>,
## # `STREET ADDRESS` <chr>, CITY <chr>
asb <- asb %>%
mutate(`ZIP CODE` = as.double(`ZIP CODE`))
asb_demo <- inner_join(asb, demo, by = c("ZIP CODE" = "Zip_code"))
colnames(asb_demo)
## [1] "BUSINESS NAME" "DBA NAME"
## [3] "ZIP CODE" "LOCATION START DATE"
## [5] "LOCATION" "NAICS"
## [7] "PRIMARY NAICS DESCRIPTION" "E-mail Address"
## [9] "Year Established" "Contact"
## [11] "STREET ADDRESS" "CITY"
## [13] "est_under_5" "est_5_to_9"
## [15] "est_10_to_14" "est_15_to_19"
## [17] "est_20_to_24" "est_25_to_34"
## [19] "est_35_to_44" "est_45_to_54"
## [21] "est_55_to_59" "est_60_to_64"
## [23] "est_65_to_74" "est_75_to_84"
## [25] "est_85_over" "est_median_age"
## [27] "est_total_pop"
asb_demo$LOCATION <- gsub("\\(|\\)", "", asb_demo$LOCATION)
asb_demo <- asb_demo %>%
separate(LOCATION, into = c("Latitude", "Longitude"), sep = ", ")
write.csv(asb_demo, file = "asb_demo.csv", row.names = FALSE)
summary(asb_demo)
## BUSINESS NAME DBA NAME ZIP CODE LOCATION START DATE
## Length:601 Length:601 Min. :90001 Length:601
## Class :character Class :character 1st Qu.:90066 Class :character
## Mode :character Mode :character Median :90805 Mode :character
## Mean :90853
## 3rd Qu.:91351
## Max. :93552
## Latitude Longitude NAICS
## Length:601 Length:601 Min. :115310
## Class :character Class :character 1st Qu.:235930
## Mode :character Mode :character Median :424950
## Mean :419374
## 3rd Qu.:541370
## Max. :999999
## PRIMARY NAICS DESCRIPTION E-mail Address Year Established
## Length:601 Length:601 Min. :1878
## Class :character Class :character 1st Qu.:1986
## Mode :character Mode :character Median :2000
## Mean :1996
## 3rd Qu.:2010
## Max. :2022
## Contact STREET ADDRESS CITY est_under_5
## Length:601 Length:601 Length:601 Min. :0.00000
## Class :character Class :character Class :character 1st Qu.:0.04861
## Mode :character Mode :character Mode :character Median :0.05606
## Mean :0.05588
## 3rd Qu.:0.06409
## Max. :0.09627
## est_5_to_9 est_10_to_14 est_15_to_19 est_20_to_24
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.04894 1st Qu.:0.04819 1st Qu.:0.04857 1st Qu.:0.05573
## Median :0.05687 Median :0.05962 Median :0.06028 Median :0.06561
## Mean :0.05589 Mean :0.05892 Mean :0.06045 Mean :0.06961
## 3rd Qu.:0.06308 3rd Qu.:0.06979 3rd Qu.:0.06853 3rd Qu.:0.08007
## Max. :0.12388 Max. :0.10157 Max. :0.23557 Max. :0.28167
## est_25_to_34 est_35_to_44 est_45_to_54 est_55_to_59
## Min. :0.05018 Min. :0.05258 Min. :0.05637 Min. :0.02960
## 1st Qu.:0.13668 1st Qu.:0.12830 1st Qu.:0.12464 1st Qu.:0.05641
## Median :0.15385 Median :0.13667 Median :0.13297 Median :0.06350
## Mean :0.16373 Mean :0.13753 Mean :0.13328 Mean :0.06410
## 3rd Qu.:0.17695 3rd Qu.:0.14990 3rd Qu.:0.14147 3rd Qu.:0.07222
## Max. :0.39474 Max. :0.24749 Max. :0.26974 Max. :0.10654
## est_60_to_64 est_65_to_74 est_75_to_84 est_85_over
## Min. :0.00000 Min. :0.00000 Min. :0.01447 Min. :0.001332
## 1st Qu.:0.05011 1st Qu.:0.06716 1st Qu.:0.03125 1st Qu.:0.013688
## Median :0.05634 Median :0.07923 Median :0.03926 Median :0.019067
## Mean :0.05774 Mean :0.08024 Mean :0.04120 Mean :0.021438
## 3rd Qu.:0.06601 3rd Qu.:0.09378 3rd Qu.:0.05180 3rd Qu.:0.028930
## Max. :0.09363 Max. :0.17148 Max. :0.20391 Max. :0.140625
## est_median_age est_total_pop
## Min. :22.80 Min. : 152
## 1st Qu.:34.80 1st Qu.: 29371
## Median :37.30 Median : 41811
## Mean :37.62 Mean : 43288
## 3rd Qu.:40.90 3rd Qu.: 53637
## Max. :66.50 Max. :110750
unique_counts <- sapply(asb_demo, function(col) length(unique(col)))
unique_counts
## BUSINESS NAME DBA NAME ZIP CODE
## 600 181 197
## LOCATION START DATE Latitude Longitude
## 530 533 541
## NAICS PRIMARY NAICS DESCRIPTION E-mail Address
## 129 129 600
## Year Established Contact STREET ADDRESS
## 81 601 598
## CITY est_under_5 est_5_to_9
## 106 197 197
## est_10_to_14 est_15_to_19 est_20_to_24
## 197 197 197
## est_25_to_34 est_35_to_44 est_45_to_54
## 197 197 197
## est_55_to_59 est_60_to_64 est_65_to_74
## 197 197 197
## est_75_to_84 est_85_over est_median_age
## 197 197 123
## est_total_pop
## 196
#NAICS counts
ns_counts <- table(asb_demo$`NAICS`)
ns_counts_df <- data.frame(Name = names(ns_counts), Count = as.numeric(ns_counts))
ns_counts_df <- ns_counts_df[order(-ns_counts_df$Count), ]
# Top NAICS code is 233210(Single Family Housing Construction)
top_ns <- head(ns_counts_df, n = 3)
top_ns
## Name Count
## 2 233210 52
## 96 541600 32
## 21 238990 21
#NAICS description counts
nd_counts <- table(asb_demo$`PRIMARY NAICS DESCRIPTION`)
nd_counts_df <- data.frame(Name = names(nd_counts), Count = as.numeric(nd_counts))
nd_counts_df <- nd_counts_df[order(-nd_counts_df$Count), ]
# Top NAICS Description is Single Family Housing Construction
top_nd <- head(nd_counts_df, n = 3)
top_nd
## Name Count
## 110 Single Family Housing Construction (1997 NAICS) 52
## 66 Management, scientific, & technical consulting services 32
## 7 All other specialty trade contractors 21
#zip code counts
zip_counts <- table(asb_demo$`ZIP CODE`)
zip_counts_df <- data.frame(Name = names(zip_counts), Count = as.numeric(zip_counts))
zip_counts_df <- zip_counts_df[order(-zip_counts_df$Count), ]
# 91311 Zip Code has most small businesses based on limited data
top_zips <- head(zip_counts_df, n = 3)
top_zips
## Name Count
## 135 91311 41
## 158 91406 14
## 92 90670 13
zip_biz_count <- sb_demo %>%
group_by(Zip) %>%
summarize(count = n())
zip_biz_count <- zip_biz_count %>%
left_join(demo, by = c("Zip" = "Zip_code"))
zip_biz_count
## # A tibble: 276 × 17
## Zip count est_under_5 est_5_to_9 est_10_to_14 est_15_to_19 est_20_to_24
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 90001 14 0.0837 0.0831 0.0910 0.0765 0.0836
## 2 90002 6 0.0800 0.0866 0.0944 0.0868 0.0920
## 3 90003 6 0.0831 0.0892 0.0948 0.0891 0.0708
## 4 90004 12 0.0613 0.0538 0.0491 0.0465 0.0735
## 5 90005 8 0.0535 0.0538 0.0501 0.0426 0.0670
## 6 90006 19 0.0513 0.0688 0.0590 0.0626 0.0746
## 7 90007 13 0.0277 0.0365 0.0399 0.119 0.282
## 8 90008 13 0.0452 0.0355 0.0629 0.0475 0.0590
## 9 90010 33 0.0166 0.0579 0 0.0231 0.0725
## 10 90011 9 0.0727 0.0878 0.0853 0.0815 0.0898
## # ℹ 266 more rows
## # ℹ 10 more variables: est_25_to_34 <dbl>, est_35_to_44 <dbl>,
## # est_45_to_54 <dbl>, est_55_to_59 <dbl>, est_60_to_64 <dbl>,
## # est_65_to_74 <dbl>, est_75_to_84 <dbl>, est_85_over <dbl>,
## # est_median_age <dbl>, est_total_pop <dbl>
correlation <- cor(zip_biz_count$count, zip_biz_count$est_total_pop)
correlation
## [1] 0.1475194
ggplot(zip_biz_count, aes(x = count, y = est_total_pop)) +
geom_point() + # Scatter plot
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Trend line
geom_text(aes(x = max(count), y = max(est_total_pop),
label = paste("Correlation:", round(correlation, 3))),
nudge_x = -100, nudge_y = 500, color = "blue") + # Correlation coefficient annotation
labs(title = "Small Business count vs Population(By Zip Code)",
x = "Small Business Count",
y = "Estimated Total Population") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Amount of small businesses created by year
sb_count_year <- sb_demo %>%
filter(!is.na(`Year Established`) & nchar(`Year Established`) >= 4) %>%
group_by(`Year Established`) %>%
summarize(count = n())
ggplot(sb_count_year, aes(x = `Year Established`, y = count)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Small Businesses established per year in the LA county", x = "Year Established", y = "Small Business Count") +
scale_x_continuous(breaks = seq(min(sb_count_year$`Year Established`), max(sb_count_year$`Year Established`), by = 10))