Initial LA County Small Business Cleaning
Install Necessary Packages
install.packages('tidyverse')
## Installing package into 'C:/Users/Eduardo Galvez/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Eduardo Galvez\AppData\Local\Temp\RtmpwDqDle\downloaded_packages
Import Packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
Load in datasets
demo_data <- suppressMessages(read_csv('ca_demo_by_zip.csv', skip = 1 , show_col_types = FALSE))
active_biz <- suppressMessages(read_csv('Listing_of_Active_Businesses.csv', show_col_types = FALSE))
#Name and Trade Name of Firm; Contact; Address and City, State Zip; Capabilities Narrative; E-mail Address; Year Established, putting this here for now to showcase
#Updated dataset, this contains manufacturing, services, r&d, and construction
#after looking into the data from the small business database, it seems that there
#is lots of overlap
small_biz <- suppressMessages(read_csv('small_biz_data.txt',show_col_types = FALSE))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
Get summary of datasets
summary(demo_data)
summary(active_biz)
summary(small_biz)
Cleaning the Active Business
Dataset—————————————————————————————-
Clean the Active Business Dataset by extracting zip Code and remove
any null/invalid zip code values
clean_active_biz <- active_biz %>%
mutate(`ZIP CODE` = substr(`ZIP CODE`, 1, 5))
num_nulls <- sum(is.na(clean_active_biz$`ZIP CODE`) | nchar(clean_active_biz$`ZIP CODE`) < 5)
print(num_nulls)
## [1] 327
filter_active_biz <- clean_active_biz %>%
filter(nchar(`ZIP CODE`) == 5)
zip_num_count <- grepl("^\\d+$", filter_active_biz$`ZIP CODE`)
count_table <- table(zip_num_count)
print(count_table)
## zip_num_count
## FALSE TRUE
## 63 574492
zip_only_int <- filter_active_biz[zip_num_count, ]
head(zip_only_int)
## # A tibble: 6 × 16
## `LOCATION ACCOUNT #` `BUSINESS NAME` `DBA NAME` `STREET ADDRESS` CITY
## <chr> <chr> <chr> <chr> <chr>
## 1 0000000150-0001-5 A A OFICINA CENTRAL HI… <NA> 2015 W TEMPLE S… LOS …
## 2 0000000156-0001-2 SPRINGBOARD NON-PROFIT… MONEY MAN… 1605 W OLYMPIC … LOS …
## 3 0002842614-0001-2 AUSTIN B CREEK <NA> 3610 W MAGNOLIA… BURB…
## 4 0003176540-0001-1 SOEUNG CHAING CAFECAFE 1330 W PACIFIC … WILM…
## 5 0002728984-0001-8 MARIA ALVAREZ <NA> 9625 SYLMAR AVE… PANO…
## 6 0002973746-0002-1 BELINDA MIXON-JOY BOOM'S RO… 327 CULVER BLVD… PLAY…
## # ℹ 11 more variables: `ZIP CODE` <chr>, `LOCATION DESCRIPTION` <chr>,
## # `MAILING ADDRESS` <chr>, `MAILING CITY` <chr>, `MAILING ZIP CODE` <chr>,
## # NAICS <dbl>, `PRIMARY NAICS DESCRIPTION` <chr>, `COUNCIL DISTRICT` <dbl>,
## # `LOCATION START DATE` <chr>, `LOCATION END DATE` <lgl>, LOCATION <chr>
Clean Null values within NAICS description column
non_null_rows <- zip_only_int %>%
filter(!is.na(`PRIMARY NAICS DESCRIPTION`))
head(non_null_rows)
## # A tibble: 6 × 16
## `LOCATION ACCOUNT #` `BUSINESS NAME` `DBA NAME` `STREET ADDRESS` CITY
## <chr> <chr> <chr> <chr> <chr>
## 1 0000000150-0001-5 A A OFICINA CENTRAL HI… <NA> 2015 W TEMPLE S… LOS …
## 2 0000000156-0001-2 SPRINGBOARD NON-PROFIT… MONEY MAN… 1605 W OLYMPIC … LOS …
## 3 0002842614-0001-2 AUSTIN B CREEK <NA> 3610 W MAGNOLIA… BURB…
## 4 0003176540-0001-1 SOEUNG CHAING CAFECAFE 1330 W PACIFIC … WILM…
## 5 0002728984-0001-8 MARIA ALVAREZ <NA> 9625 SYLMAR AVE… PANO…
## 6 0002973746-0002-1 BELINDA MIXON-JOY BOOM'S RO… 327 CULVER BLVD… PLAY…
## # ℹ 11 more variables: `ZIP CODE` <chr>, `LOCATION DESCRIPTION` <chr>,
## # `MAILING ADDRESS` <chr>, `MAILING CITY` <chr>, `MAILING ZIP CODE` <chr>,
## # NAICS <dbl>, `PRIMARY NAICS DESCRIPTION` <chr>, `COUNCIL DISTRICT` <dbl>,
## # `LOCATION START DATE` <chr>, `LOCATION END DATE` <lgl>, LOCATION <chr>
Clean Null Business Name
clean_active_biz_name <- non_null_rows %>%
filter(!is.na(`BUSINESS NAME`))
head(clean_active_biz_name)
## # A tibble: 6 × 16
## `LOCATION ACCOUNT #` `BUSINESS NAME` `DBA NAME` `STREET ADDRESS` CITY
## <chr> <chr> <chr> <chr> <chr>
## 1 0000000150-0001-5 A A OFICINA CENTRAL HI… <NA> 2015 W TEMPLE S… LOS …
## 2 0000000156-0001-2 SPRINGBOARD NON-PROFIT… MONEY MAN… 1605 W OLYMPIC … LOS …
## 3 0002842614-0001-2 AUSTIN B CREEK <NA> 3610 W MAGNOLIA… BURB…
## 4 0003176540-0001-1 SOEUNG CHAING CAFECAFE 1330 W PACIFIC … WILM…
## 5 0002728984-0001-8 MARIA ALVAREZ <NA> 9625 SYLMAR AVE… PANO…
## 6 0002973746-0002-1 BELINDA MIXON-JOY BOOM'S RO… 327 CULVER BLVD… PLAY…
## # ℹ 11 more variables: `ZIP CODE` <chr>, `LOCATION DESCRIPTION` <chr>,
## # `MAILING ADDRESS` <chr>, `MAILING CITY` <chr>, `MAILING ZIP CODE` <chr>,
## # NAICS <dbl>, `PRIMARY NAICS DESCRIPTION` <chr>, `COUNCIL DISTRICT` <dbl>,
## # `LOCATION START DATE` <chr>, `LOCATION END DATE` <lgl>, LOCATION <chr>
#luckily there are no null business name values in this column
Drop the location end date, as all the values are null
clean_active_biz_name <- clean_active_biz_name %>% select(-`LOCATION END DATE`)
Clean the Small Business Dataset
——————————————————————————————-
Clean the zip code column
#similar cleaning to the active business dataset
clean_small_biz_zip <- small_biz %>%
mutate(`Zip` = substr(`Zip`, 1, 5))
sb_num_nulls <- sum(is.na(clean_small_biz_zip$`Zip`) | nchar(clean_small_biz_zip$`Zip`) < 5)
print(sb_num_nulls)
## [1] 1
valid_zip <- clean_small_biz_zip %>%
filter(nchar(`Zip`) == 5)
sb_zip_num_count <- grepl("^\\d+$", valid_zip$`Zip`)
sb_count_table <- table(sb_zip_num_count)
print(sb_count_table)
## sb_zip_num_count
## TRUE
## 6769
sb_zip_only_int <- valid_zip[sb_zip_num_count, ]
head(sb_zip_only_int)
## # A tibble: 6 × 13
## View `Name of Firm` `Trade Name` unknown...4 unknown...5 `Address, line 1`
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 Maya Advertising… <NA> Luis Vasqu… PRESIDENT 300 The Village
## 2 2 SIGNATURE OFFICE… <NA> GARY KRIEG… General Ma… 6033 S CENTRAL A…
## 3 3 CHEETAH EXPRESS <NA> LEE ELLIS <NA> 3010 WILSHIRE BL…
## 4 4 SAESHE ADVERTISI… <NA> YOUNG YU VICE PRES 1055 WEST 7TH ST…
## 5 5 ELECTROLIZING INC <NA> JACK MORG… SALES 1947 HOOPER AVE
## 6 6 STEMAR EQUIPMENT… <NA> TIM MARKON… PRESIDENT 353 S CENTRAL AVE
## # ℹ 7 more variables: `Address, line 2` <chr>, City <chr>, State <chr>,
## # Zip <chr>, `Capabilities Narrative` <chr>, `E-mail Address` <chr>,
## # `Year Established` <dbl>
Remove null from: “Name of Firm”, “Capbilities Narrative”
sb_clean <- sb_contact_combine %>%
filter(!is.na(`Name of Firm`))
print(sb_clean)
## # A tibble: 6,706 × 12
## View `Name of Firm` `Trade Name` `Address, line 1` `Address, line 2` City
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 Maya Advertisin… <NA> 300 The Village <NA> Redo…
## 2 2 SIGNATURE OFFIC… <NA> 6033 S CENTRAL A… <NA> LOS …
## 3 3 CHEETAH EXPRESS <NA> 3010 WILSHIRE BL… <NA> LOS …
## 4 4 SAESHE ADVERTIS… <NA> 1055 WEST 7TH ST… <NA> LOS …
## 5 5 ELECTROLIZING I… <NA> 1947 HOOPER AVE <NA> LOS …
## 6 6 STEMAR EQUIPMEN… <NA> 353 S CENTRAL AVE <NA> LOS …
## 7 7 NATURAL BUILDIN… <NA> 16610 S WESTERN … <NA> GARD…
## 8 8 U M S INC UNITED MAIN… 2140 W OLYMPIC B… <NA> LOS …
## 9 9 STOVER SEED COM… <NA> 1415 EAST SIXTH … <NA> LOS …
## 10 10 HOLLYWOOD FILM … <NA> 9265 BORDEN AVE <NA> SUN …
## # ℹ 6,696 more rows
## # ℹ 6 more variables: State <chr>, Zip <chr>, `Capabilities Narrative` <chr>,
## # `E-mail Address` <chr>, `Year Established` <dbl>, Contact <chr>
sb_clean <- sb_clean %>%
filter(!is.na(`Capabilities Narrative`))
print(sb_clean)
## # A tibble: 6,020 × 12
## View `Name of Firm` `Trade Name` `Address, line 1` `Address, line 2` City
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 Maya Advertisin… <NA> 300 The Village <NA> Redo…
## 2 2 SIGNATURE OFFIC… <NA> 6033 S CENTRAL A… <NA> LOS …
## 3 3 CHEETAH EXPRESS <NA> 3010 WILSHIRE BL… <NA> LOS …
## 4 4 SAESHE ADVERTIS… <NA> 1055 WEST 7TH ST… <NA> LOS …
## 5 5 ELECTROLIZING I… <NA> 1947 HOOPER AVE <NA> LOS …
## 6 6 STEMAR EQUIPMEN… <NA> 353 S CENTRAL AVE <NA> LOS …
## 7 7 NATURAL BUILDIN… <NA> 16610 S WESTERN … <NA> GARD…
## 8 8 U M S INC UNITED MAIN… 2140 W OLYMPIC B… <NA> LOS …
## 9 9 STOVER SEED COM… <NA> 1415 EAST SIXTH … <NA> LOS …
## 10 10 HOLLYWOOD FILM … <NA> 9265 BORDEN AVE <NA> SUN …
## # ℹ 6,010 more rows
## # ℹ 6 more variables: State <chr>, Zip <chr>, `Capabilities Narrative` <chr>,
## # `E-mail Address` <chr>, `Year Established` <dbl>, Contact <chr>
Drop view column from Small Business dataset, essentially a row
numerator
sb_clean <- sb_clean %>% select(-`View`)
Clean California Demographics
Dataset—————————————————————————————
Keep relevant columns and rename
demo_data <- demo_data %>%
select(
"Geographic Area Name",
"Estimate!!SEX AND AGE!!Total population!!Under 5 years",
"Estimate!!SEX AND AGE!!Total population!!5 to 9 years",
"Estimate!!SEX AND AGE!!Total population!!10 to 14 years",
"Estimate!!SEX AND AGE!!Total population!!15 to 19 years",
"Estimate!!SEX AND AGE!!Total population!!20 to 24 years",
"Estimate!!SEX AND AGE!!Total population!!25 to 34 years",
"Estimate!!SEX AND AGE!!Total population!!35 to 44 years",
"Estimate!!SEX AND AGE!!Total population!!45 to 54 years",
"Estimate!!SEX AND AGE!!Total population!!55 to 59 years",
"Estimate!!SEX AND AGE!!Total population!!60 to 64 years",
"Estimate!!SEX AND AGE!!Total population!!65 to 74 years",
"Estimate!!SEX AND AGE!!Total population!!75 to 84 years",
"Estimate!!SEX AND AGE!!Total population!!85 years and over",
"Estimate!!SEX AND AGE!!Total population!!Median age (years)"
) %>%
rename(
"Zip_code" = "Geographic Area Name",
"est_under_5" = "Estimate!!SEX AND AGE!!Total population!!Under 5 years",
"est_5_to_9" = "Estimate!!SEX AND AGE!!Total population!!5 to 9 years",
"est_10_to_14" = "Estimate!!SEX AND AGE!!Total population!!10 to 14 years",
"est_15_to_19" = "Estimate!!SEX AND AGE!!Total population!!15 to 19 years",
"est_20_to_24" = "Estimate!!SEX AND AGE!!Total population!!20 to 24 years",
"est_25_to_34" = "Estimate!!SEX AND AGE!!Total population!!25 to 34 years",
"est_35_to_44" = "Estimate!!SEX AND AGE!!Total population!!35 to 44 years",
"est_45_to_54" = "Estimate!!SEX AND AGE!!Total population!!45 to 54 years",
"est_55_to_59" = "Estimate!!SEX AND AGE!!Total population!!55 to 59 years",
"est_60_to_64" = "Estimate!!SEX AND AGE!!Total population!!60 to 64 years",
"est_65_to_74" = "Estimate!!SEX AND AGE!!Total population!!65 to 74 years",
"est_75_to_84" = "Estimate!!SEX AND AGE!!Total population!!75 to 84 years",
"est_85_over" = "Estimate!!SEX AND AGE!!Total population!!85 years and over",
"est_median_age" = "Estimate!!SEX AND AGE!!Total population!!Median age (years)"
)
colnames(demo_data)
## [1] "Zip_code" "est_under_5" "est_5_to_9" "est_10_to_14"
## [5] "est_15_to_19" "est_20_to_24" "est_25_to_34" "est_35_to_44"
## [9] "est_45_to_54" "est_55_to_59" "est_60_to_64" "est_65_to_74"
## [13] "est_75_to_84" "est_85_over" "est_median_age"
Clean zip code column
#get the last 5 chars to get the zip code
demo_data <- demo_data %>%
mutate(`Zip_code` = substr(`Zip_code`, nchar(`Zip_code`) - 4, nchar(`Zip_code`)))
summary(demo_data)
## Zip_code est_under_5 est_5_to_9 est_10_to_14
## Length:1769 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 59 1st Qu.: 71 1st Qu.: 73
## Mode :character Median : 833 Median : 868 Median : 877
## Mean :1362 Mean :1375 Mean :1469
## 3rd Qu.:2215 3rd Qu.:2255 3rd Qu.:2348
## Max. :8224 Max. :9724 Max. :9449
## est_15_to_19 est_20_to_24 est_25_to_34 est_35_to_44
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 72 1st Qu.: 65 1st Qu.: 147 1st Qu.: 165
## Median : 854 Median : 880 Median : 2014 Median : 1968
## Mean : 1440 Mean : 1522 Mean : 3396 Mean : 2959
## 3rd Qu.: 2296 3rd Qu.: 2341 3rd Qu.: 5630 3rd Qu.: 4960
## Max. :12266 Max. :16890 Max. :18544 Max. :15930
## est_45_to_54 est_55_to_59 est_60_to_64 est_65_to_74 est_75_to_84
## Min. : 0 Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 178 1st Qu.: 103 1st Qu.: 114 1st Qu.: 187 1st Qu.: 81.0
## Median : 2022 Median :1030 Median : 992 Median :1412 Median : 653.0
## Mean : 2850 Mean :1406 Mean :1276 Mean :1851 Mean : 910.9
## 3rd Qu.: 4717 3rd Qu.:2356 3rd Qu.:2162 3rd Qu.:3066 3rd Qu.:1509.0
## Max. :13794 Max. :6993 Max. :6315 Max. :8537 Max. :5172.0
## est_85_over est_median_age
## Min. : 0.0 Length:1769
## 1st Qu.: 24.0 Class :character
## Median : 264.0 Mode :character
## Mean : 432.5
## 3rd Qu.: 741.0
## Max. :3169.0
#make sure there are no null or zip codes with length less than 5
demo_zip_nulls <- sum(is.na(demo_data$`Zip_code`) | nchar(demo_data$`Zip_code`) < 5)
print(num_nulls)
## [1] 327
demo_data <- demo_data %>%
filter(nchar(`Zip_code`) == 5)
demo_zip_num_count <- grepl("^\\d+$", demo_data$`Zip_code`)
count_table <- table(demo_zip_num_count)
print(count_table)
## demo_zip_num_count
## TRUE
## 1769
demo_data <- demo_data[demo_zip_num_count, ]
summary(demo_data)
## Zip_code est_under_5 est_5_to_9 est_10_to_14
## Length:1769 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 59 1st Qu.: 71 1st Qu.: 73
## Mode :character Median : 833 Median : 868 Median : 877
## Mean :1362 Mean :1375 Mean :1469
## 3rd Qu.:2215 3rd Qu.:2255 3rd Qu.:2348
## Max. :8224 Max. :9724 Max. :9449
## est_15_to_19 est_20_to_24 est_25_to_34 est_35_to_44
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 72 1st Qu.: 65 1st Qu.: 147 1st Qu.: 165
## Median : 854 Median : 880 Median : 2014 Median : 1968
## Mean : 1440 Mean : 1522 Mean : 3396 Mean : 2959
## 3rd Qu.: 2296 3rd Qu.: 2341 3rd Qu.: 5630 3rd Qu.: 4960
## Max. :12266 Max. :16890 Max. :18544 Max. :15930
## est_45_to_54 est_55_to_59 est_60_to_64 est_65_to_74 est_75_to_84
## Min. : 0 Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 178 1st Qu.: 103 1st Qu.: 114 1st Qu.: 187 1st Qu.: 81.0
## Median : 2022 Median :1030 Median : 992 Median :1412 Median : 653.0
## Mean : 2850 Mean :1406 Mean :1276 Mean :1851 Mean : 910.9
## 3rd Qu.: 4717 3rd Qu.:2356 3rd Qu.:2162 3rd Qu.:3066 3rd Qu.:1509.0
## Max. :13794 Max. :6993 Max. :6315 Max. :8537 Max. :5172.0
## est_85_over est_median_age
## Min. : 0.0 Length:1769
## 1st Qu.: 24.0 Class :character
## Median : 264.0 Mode :character
## Mean : 432.5
## 3rd Qu.: 741.0
## Max. :3169.0
Remove any nulls present in Demographic data rows
summary(demo_data)
## Zip_code est_under_5 est_5_to_9 est_10_to_14
## Length:1769 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 59 1st Qu.: 71 1st Qu.: 73
## Mode :character Median : 833 Median : 868 Median : 877
## Mean :1362 Mean :1375 Mean :1469
## 3rd Qu.:2215 3rd Qu.:2255 3rd Qu.:2348
## Max. :8224 Max. :9724 Max. :9449
## est_15_to_19 est_20_to_24 est_25_to_34 est_35_to_44
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 72 1st Qu.: 65 1st Qu.: 147 1st Qu.: 165
## Median : 854 Median : 880 Median : 2014 Median : 1968
## Mean : 1440 Mean : 1522 Mean : 3396 Mean : 2959
## 3rd Qu.: 2296 3rd Qu.: 2341 3rd Qu.: 5630 3rd Qu.: 4960
## Max. :12266 Max. :16890 Max. :18544 Max. :15930
## est_45_to_54 est_55_to_59 est_60_to_64 est_65_to_74 est_75_to_84
## Min. : 0 Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 178 1st Qu.: 103 1st Qu.: 114 1st Qu.: 187 1st Qu.: 81.0
## Median : 2022 Median :1030 Median : 992 Median :1412 Median : 653.0
## Mean : 2850 Mean :1406 Mean :1276 Mean :1851 Mean : 910.9
## 3rd Qu.: 4717 3rd Qu.:2356 3rd Qu.:2162 3rd Qu.:3066 3rd Qu.:1509.0
## Max. :13794 Max. :6993 Max. :6315 Max. :8537 Max. :5172.0
## est_85_over est_median_age
## Min. : 0.0 Length:1769
## 1st Qu.: 24.0 Class :character
## Median : 264.0 Mode :character
## Mean : 432.5
## 3rd Qu.: 741.0
## Max. :3169.0
demo_data <- na.omit(demo_data)
summary(demo_data)
## Zip_code est_under_5 est_5_to_9 est_10_to_14
## Length:1769 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 59 1st Qu.: 71 1st Qu.: 73
## Mode :character Median : 833 Median : 868 Median : 877
## Mean :1362 Mean :1375 Mean :1469
## 3rd Qu.:2215 3rd Qu.:2255 3rd Qu.:2348
## Max. :8224 Max. :9724 Max. :9449
## est_15_to_19 est_20_to_24 est_25_to_34 est_35_to_44
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 72 1st Qu.: 65 1st Qu.: 147 1st Qu.: 165
## Median : 854 Median : 880 Median : 2014 Median : 1968
## Mean : 1440 Mean : 1522 Mean : 3396 Mean : 2959
## 3rd Qu.: 2296 3rd Qu.: 2341 3rd Qu.: 5630 3rd Qu.: 4960
## Max. :12266 Max. :16890 Max. :18544 Max. :15930
## est_45_to_54 est_55_to_59 est_60_to_64 est_65_to_74 est_75_to_84
## Min. : 0 Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 178 1st Qu.: 103 1st Qu.: 114 1st Qu.: 187 1st Qu.: 81.0
## Median : 2022 Median :1030 Median : 992 Median :1412 Median : 653.0
## Mean : 2850 Mean :1406 Mean :1276 Mean :1851 Mean : 910.9
## 3rd Qu.: 4717 3rd Qu.:2356 3rd Qu.:2162 3rd Qu.:3066 3rd Qu.:1509.0
## Max. :13794 Max. :6993 Max. :6315 Max. :8537 Max. :5172.0
## est_85_over est_median_age
## Min. : 0.0 Length:1769
## 1st Qu.: 24.0 Class :character
## Median : 264.0 Mode :character
## Mean : 432.5
## 3rd Qu.: 741.0
## Max. :3169.0
Check for duplicates
dupe_demo <- duplicated(demo_data)
a <- sum(dupe_demo)
dupe_sb_clean <- duplicated(sb_clean)
b <- sum(dupe_sb_clean)
dupe_active_biz <- duplicated(clean_active_biz_name)
c <- sum(dupe_active_biz)
paste("Number of duplicate rows in demo data:", a)
## [1] "Number of duplicate rows in demo data: 0"
paste("Number of duplicate rows in small business data:", b)
## [1] "Number of duplicate rows in small business data: 0"
paste("Number of duplicate rows in active business data:", c)
## [1] "Number of duplicate rows in active business data: 0"
Creating a report for each of the datasets to check integrity
summary(demo_data)
## Zip_code est_under_5 est_5_to_9 est_10_to_14
## Length:1769 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 59 1st Qu.: 71 1st Qu.: 73
## Mode :character Median : 833 Median : 868 Median : 877
## Mean :1362 Mean :1375 Mean :1469
## 3rd Qu.:2215 3rd Qu.:2255 3rd Qu.:2348
## Max. :8224 Max. :9724 Max. :9449
## est_15_to_19 est_20_to_24 est_25_to_34 est_35_to_44
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 72 1st Qu.: 65 1st Qu.: 147 1st Qu.: 165
## Median : 854 Median : 880 Median : 2014 Median : 1968
## Mean : 1440 Mean : 1522 Mean : 3396 Mean : 2959
## 3rd Qu.: 2296 3rd Qu.: 2341 3rd Qu.: 5630 3rd Qu.: 4960
## Max. :12266 Max. :16890 Max. :18544 Max. :15930
## est_45_to_54 est_55_to_59 est_60_to_64 est_65_to_74 est_75_to_84
## Min. : 0 Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 178 1st Qu.: 103 1st Qu.: 114 1st Qu.: 187 1st Qu.: 81.0
## Median : 2022 Median :1030 Median : 992 Median :1412 Median : 653.0
## Mean : 2850 Mean :1406 Mean :1276 Mean :1851 Mean : 910.9
## 3rd Qu.: 4717 3rd Qu.:2356 3rd Qu.:2162 3rd Qu.:3066 3rd Qu.:1509.0
## Max. :13794 Max. :6993 Max. :6315 Max. :8537 Max. :5172.0
## est_85_over est_median_age
## Min. : 0.0 Length:1769
## 1st Qu.: 24.0 Class :character
## Median : 264.0 Mode :character
## Mean : 432.5
## 3rd Qu.: 741.0
## Max. :3169.0
summary(sb_clean)
## Name of Firm Trade Name Address, line 1 Address, line 2
## Length:6020 Length:6020 Length:6020 Length:6020
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## City State Zip
## Length:6020 Length:6020 Length:6020
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Capabilities Narrative E-mail Address Year Established Contact
## Length:6020 Length:6020 Min. : 200 Length:6020
## Class :character Class :character 1st Qu.:1991 Class :character
## Mode :character Mode :character Median :2003 Mode :character
## Mean :1998
## 3rd Qu.:2010
## Max. :2023
## NA's :3
summary(clean_active_biz_name)
## LOCATION ACCOUNT # BUSINESS NAME DBA NAME STREET ADDRESS
## Length:511745 Length:511745 Length:511745 Length:511745
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## CITY ZIP CODE LOCATION DESCRIPTION MAILING ADDRESS
## Length:511745 Length:511745 Length:511745 Length:511745
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## MAILING CITY MAILING ZIP CODE NAICS
## Length:511745 Length:511745 Min. :115210
## Class :character Class :character 1st Qu.:452000
## Mode :character Mode :character Median :541213
## Mean :548421
## 3rd Qu.:624200
## Max. :999999
## PRIMARY NAICS DESCRIPTION COUNCIL DISTRICT LOCATION START DATE
## Length:511745 Min. : 0.000 Length:511745
## Class :character 1st Qu.: 2.000 Class :character
## Mode :character Median : 5.000 Mode :character
## Mean : 6.425
## 3rd Qu.:11.000
## Max. :15.000
## LOCATION
## Length:511745
## Class :character
## Mode :character
##
##
##
Save the datasets to csv for the analysis step
write.csv(demo_data, file = "lac_demographics.csv", row.names = FALSE)
write.csv(sb_clean, file = "lac_small_business.csv", row.names = FALSE)
write.csv(clean_active_biz_name, file = "lac_active_business.csv", row.names = FALSE)