library(sergeant)
This analysis replicates https://drill.apache.org/docs/analyzing-the-yelp-academic-dataset/ using (mostly) just dplyr
calls.
It assumes you have installed Drill per https://drill.apache.org/docs/drill-in-10-minutes/ and have grabbed the Yelp data set per https://www.yelp.com/dataset_challenge.
NOTE that as of the date on this Rmd, the Yelp analysis on the Drill site won’t work there as the Yelp JSON structure changed. This Rmd used Yelp data downloaded on the day of the date in the header.
The local name of my Drill server is bigd
(don’t judge) so you’ll have to make adjustments for that. Likely localhost
for many folks.
db <- src_drill("bigd")
yelp_biz <- tbl(db, "dfs.d.`/yelp/yelp_academic_dataset_business.json`")
yelp_biz
## # Source: table<dfs.d.`/yelp/yelp_academic_dataset_business.json`> [?? x
## # 16]
## # Database: DrillConnection
## hours
## <chr>
## 1 "[\"Monday 11:0-21:0\",\"Tuesday 11:0-21:0\",\"Wednesday 11:0-21:0\",\"Thur
## 2 "[\"Monday 0:0-0:0\",\"Tuesday 0:0-0:0\",\"Wednesday 0:0-0:0\",\"Thursday 0
## 3 "[\"Monday 11:0-2:0\",\"Tuesday 11:0-2:0\",\"Wednesday 11:0-2:0\",\"Thursda
## 4 "[\"Tuesday 10:0-21:0\",\"Wednesday 10:0-21:0\",\"Thursday 10:0-21:0\",\"Fr
## 5 []
## 6 "[\"Monday 10:30-20:0\",\"Tuesday 10:30-20:0\",\"Thursday 10:30-20:0\",\"Fr
## 7 "[\"Monday 10:0-18:0\",\"Tuesday 10:0-18:0\",\"Wednesday 10:0-18:0\",\"Thur
## 8 "[\"Monday 10:0-22:0\",\"Tuesday 10:0-22:0\",\"Wednesday 10:0-22:0\",\"Thur
## 9 "[\"Monday 9:30-18:0\",\"Tuesday 9:30-18:0\",\"Wednesday 9:30-18:0\",\"Thur
## 10 []
## # ... with more rows, and 15 more variables: address <chr>, city <chr>,
## # is_open <int>, latitude <dbl>, review_count <int>, stars <dbl>,
## # type <chr>, name <chr>, attributes <chr>, neighborhood <chr>,
## # state <chr>, categories <chr>, postal_code <chr>, business_id <chr>,
## # longitude <dbl>
glimpse(yelp_biz)
## Observations: 25
## Variables: 16
## $ hours <chr> "[\"Monday 11:0-21:0\",\"Tuesday 11:0-21:0\",\"We...
## $ address <chr> "227 E Baseline Rd, Ste J2", "495 S Grand Central...
## $ city <chr> "Tempe", "Las Vegas", "Toronto", "Oakdale", "Toro...
## $ is_open <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1...
## $ latitude <dbl> 33.37821, 36.19228, 43.66105, 40.44454, 43.65983,...
## $ review_count <int> 17, 9, 7, 4, 8, 3, 8, 9, 11, 3, 7, 38, 4, 4, 3, 3...
## $ stars <dbl> 4.5, 5.0, 2.5, 4.0, 3.0, 2.5, 3.5, 2.5, 4.5, 3.5,...
## $ type <chr> "business", "business", "business", "business", "...
## $ name <chr> "Innovative Vapors", "Cut and Taste", "Pizza Pizz...
## $ attributes <chr> "[\"BikeParking: True\",\"BusinessAcceptsBitcoin:...
## $ neighborhood <chr> NA, NA, "Dufferin Grove", NA, "Downtown Core", NA...
## $ state <chr> "AZ", "NV", "ON", "PA", "ON", "ON", "AZ", "AZ", "...
## $ categories <chr> "[\"Tobacco Shops\",\"Nightlife\",\"Vape Shops\",...
## $ postal_code <chr> "85283", "89106", "M6H 1L5", "15071", "M5B 2C2", ...
## $ business_id <chr> "0DI8Dt2PJp07XkVvIElIcQ", "LTlCaCGZE14GuaUXUGbamg...
## $ longitude <dbl> -111.93610, -115.15927, -79.42909, -80.17454, -79...
count(yelp_biz, wt=review_count)
## # Source: lazy query [?? x 1]
## # Database: DrillConnection
## n
## <int>
## 1 4152949
count(yelp_biz, state, city, wt=review_count, sort=TRUE)
## # Source: lazy query [?? x 3]
## # Database: DrillConnection
## # Groups: state
## # Ordered by: desc(n)
## city state n
## <chr> <chr> <int>
## 1 Las Vegas NV 1295478
## 2 Phoenix AZ 456673
## 3 Toronto ON 349695
## 4 Scottsdale AZ 245105
## 5 Charlotte NC 183463
## 6 Pittsburgh PA 143116
## 7 Tempe AZ 129786
## 8 Henderson NV 126750
## 9 Mesa AZ 97842
## 10 Montréal QC 96510
## # ... with more rows
group_by(yelp_biz, stars) %>%
summarise(reviews_avg = trunc(avg(review_count), 0L)) %>%
arrange(desc(stars)) %>%
collect() %>%
select(stars, reviews_avg)
## # A tibble: 9 x 2
## stars reviews_avg
## * <dbl> <dbl>
## 1 5.0 11
## 2 4.5 34
## 3 4.0 45
## 4 3.5 36
## 5 3.0 26
## 6 2.5 18
## 7 2.0 12
## 8 1.5 12
## 9 1.0 6
filter(yelp_biz, review_count > 1000) %>%
select(name, state, city, review_count) %>%
arrange(desc(review_count)) %>%
collect() %>%
select(name, state, city, review_count) %>%
head(10)
## # A tibble: 10 x 4
## name state city review_count
## <chr> <chr> <chr> <int>
## 1 Mon Ami Gabi NV Las Vegas 6414
## 2 Bacchanal Buffet NV Las Vegas 5715
## 3 Wicked Spoon NV Las Vegas 5216
## 4 Gordon Ramsay BurGR NV Las Vegas 5116
## 5 Earl of Sandwich NV Las Vegas 4655
## 6 Gangnam Asian BBQ Dining NV Las Vegas 4120
## 7 Serendipity 3 NV Las Vegas 3911
## 8 Hash House A Go Go NV Las Vegas 3881
## 9 The Buffet NV Las Vegas 3676
## 10 The Buffet at Bellagio NV Las Vegas 3481
NOTE: We need to delve into Drill SQL b/c we need to target nested JSON info directly
tbl(db, "(SELECT b.name AS name, b.hours[5] AS sat_hrs FROM dfs.d.`/yelp/yelp_academic_dataset_business.json` b)") %>%
filter(!is.na(sat_hrs)) %>%
filter(grepl("Saturday", sat_hrs)) %>%
select(name, sat_hrs) %>%
collect() %>%
mutate(sat_hrs = gsub("Saturday ", "", sat_hrs)) %>%
tidyr::separate(sat_hrs, c("open", "close"), "-") %>%
filter(open != "0:0" & close != "0:0")
## # A tibble: 70,282 x 3
## open close name
## <chr> <chr> <chr>
## 1 10:0 22:0 Innovative Vapors
## 2 11:0 3:0 Pizza Pizza
## 3 10:0 18:0 Boomerang Baby
## 4 10:0 22:0 Taco Bell
## 5 8:30 17:0 CubeSmart Self Storage
## 6 8:0 17:0 Revv Illusions
## 7 11:0 21:30 Ohana Hawaiian BBQ
## 8 11:0 19:0 Encore Books and Records
## 9 10:0 22:0 Fossil Store
## 10 10:0 21:0 Best Buy
## # ... with 70,272 more rows
filter(yelp_biz, repeated_contains(categories, 'Restaurants')) %>%
count()
## # Source: lazy query [?? x 1]
## # Database: DrillConnection
## n
## <int>
## 1 48485
filter(yelp_biz, repeated_contains(categories, 'Restaurants')) %>%
arrange(desc(review_count)) %>%
select(name, state, city, review_count)
## # Source: lazy query [?? x 4]
## # Database: DrillConnection
## # Ordered by: desc(review_count)
## city name review_count state
## <chr> <chr> <int> <chr>
## 1 Las Vegas Mon Ami Gabi 6414 NV
## 2 Las Vegas Bacchanal Buffet 5715 NV
## 3 Las Vegas Wicked Spoon 5216 NV
## 4 Las Vegas Gordon Ramsay BurGR 5116 NV
## 5 Las Vegas Earl of Sandwich 4655 NV
## 6 Las Vegas Gangnam Asian BBQ Dining 4120 NV
## 7 Las Vegas Serendipity 3 3911 NV
## 8 Las Vegas Hash House A Go Go 3881 NV
## 9 Las Vegas The Buffet 3676 NV
## 10 Las Vegas The Buffet at Bellagio 3481 NV
## # ... with more rows
filter(yelp_biz, repeated_contains(categories, 'Restaurants')) %>%
mutate(category_count = repeated_count(categories)) %>%
arrange(desc(category_count)) %>%
select(name, category_count, categories)
## # Source: lazy query [?? x 3]
## # Database: DrillConnection
## # Ordered by: desc(category_count)
## category_count name
## <int> <chr>
## 1 23 Best Of The Best DJ's
## 2 18 Kale Personal Chef Services
## 3 17 EATT Healthy Food
## 4 17 The Bier Markt
## 5 16 3 Brothers Pizza
## 6 16 DeSoto Central Market
## 7 15 Cabin Fever
## 8 15 Golden Bar & Restaurant Equipment
## 9 15 Yard House Red Rock Resort
## 10 15 The Monarch Tavern
## # ... with more rows, and 1 more variables: categories <chr>
NOTE: We need to delve into Drill SQL b/c we need to target nested JSON info directly
tbl(db, "(SELECT categories[0] AS first_cat, COUNT(categories[0]) AS category_count FROM dfs.d.`/yelp/yelp_academic_dataset_business.json` GROUP BY categories[0] ORDER BY COUNT(categories[0]) DESC)")
## # Source: table<(SELECT categories[0] AS first_cat, COUNT(categories[0])
## # AS category_count FROM
## # dfs.d.`/yelp/yelp_academic_dataset_business.json` GROUP BY
## # categories[0] ORDER BY COUNT(categories[0]) DESC)> [?? x 2]
## # Database: DrillConnection
## first_cat category_count
## <chr> <int>
## 1 Restaurants 16632
## 2 Food 6538
## 3 Shopping 6257
## 4 Beauty & Spas 4746
## 5 Home Services 3533
## 6 Health & Medical 3083
## 7 Automotive 3045
## 8 Local Services 2551
## 9 Nightlife 2353
## 10 Active Life 1917
## # ... with more rows
mutate(yelp_biz, category = flatten(categories)) %>%
select(name, category)
## # Source: lazy query [?? x 2]
## # Database: DrillConnection
## name category
## <chr> <chr>
## 1 Innovative Vapors Tobacco Shops
## 2 Innovative Vapors Nightlife
## 3 Innovative Vapors Vape Shops
## 4 Innovative Vapors Shopping
## 5 Cut and Taste Caterers
## 6 Cut and Taste Grocery
## 7 Cut and Taste Food
## 8 Cut and Taste Event Planning & Services
## 9 Cut and Taste Party & Event Planning
## 10 Cut and Taste Specialty Food
## # ... with more rows
mutate(yelp_biz, category = flatten(categories)) %>%
count(category, sort=TRUE)
## # Source: lazy query [?? x 2]
## # Database: DrillConnection
## # Ordered by: desc(n)
## category n
## <chr> <int>
## 1 Restaurants 48485
## 2 Shopping 22466
## 3 Food 21189
## 4 Beauty & Spas 13711
## 5 Home Services 11241
## 6 Nightlife 10524
## 7 Health & Medical 10476
## 8 Bars 9087
## 9 Automotive 8554
## 10 Local Services 8133
## # ... with more rows
yelp_rev <- tbl(db, "dfs.d.`/yelp/yelp_academic_dataset_review.json`")
yelp_rev
## # Source: table<dfs.d.`/yelp/yelp_academic_dataset_review.json`> [?? x
## # 10]
## # Database: DrillConnection
## date review_id user_id cool stars
## <date> <chr> <chr> <int> <int>
## 1 2011-10-10 NxL8SIC5yqOdnlXCg18IBg KpkOkG6RIf4Ra25Lhhxf1A 0 5
## 2 2010-12-29 pXbbIgOXvLuTi_SPs1hQEQ bQ7fQq1otn9hKX-gXRsrgA 0 5
## 3 2011-04-29 wslW2Lu4NYylb1jEapAGsw r1NUhdNmL6yU9Bn-Yx6FTw 0 5
## 4 2014-07-14 GP6YEearUWrzPtQYSF1vVg aW3ix1KNZAvoM8q-WghA3Q 1 5
## 5 2014-01-15 25RlYGq2s5qShi-pn3ufVA YOo-Cip8HqvKp_p9nEGphw 0 4
## 6 2013-04-28 Uf1Ki1yyH_JDKhLvn2e4FQ bgl3j8yJcRO-00NkUYsXGQ 1 5
## 7 2014-10-12 oFmVZh-La7SuvpHrH_Al4Q CWKF9de-nskLYEqDDCfubg 0 4
## 8 2012-09-18 bRvdVt88MJ_YMTlLbjDLxQ GJ7PTY7huYORFKKg3db3Gw 0 5
## 9 2015-10-11 zNUSxqflZKgKD1NQH3jdFA rxqp9eXZj1jYTn0UIsm3Hg 0 5
## 10 2015-04-05 LkP1l7sZIwOV6IKNLqQp_A UU0nHQtHPMAfLidk8tOHTg 0 5
## # ... with more rows, and 5 more variables: text <chr>, type <chr>,
## # business_id <chr>, useful <int>, funny <int>
glimpse(yelp_rev)
## Observations: 25
## Variables: 10
## $ date <date> 2011-10-10, 2010-12-29, 2011-04-29, 2014-07-14, 2...
## $ review_id <chr> "NxL8SIC5yqOdnlXCg18IBg", "pXbbIgOXvLuTi_SPs1hQEQ"...
## $ user_id <chr> "KpkOkG6RIf4Ra25Lhhxf1A", "bQ7fQq1otn9hKX-gXRsrgA"...
## $ cool <int> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0,...
## $ stars <int> 5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 1, 5, 4, 5, 5, 1, 4,...
## $ text <chr> "If you enjoy service by someone who is as compete...
## $ type <chr> "review", "review", "review", "review", "review", ...
## $ business_id <chr> "2aFiy99vNLklCx3T_tGS9A", "2aFiy99vNLklCx3T_tGS9A"...
## $ useful <int> 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 1, 1, 0, 1, 0, 4, 0,...
## $ funny <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0,...
count(yelp_rev, business_id, wt=cool, sort=TRUE) %>%
filter(n > 2000) %>%
left_join(select(yelp_biz, business_id, name)) %>%
select(name, n)
## # Source: lazy query [?? x 2]
## # Database: DrillConnection
## # Ordered by: desc(n)
## name n
## <chr> <int>
## 1 Bacchanal Buffet 4789
## 2 Amy's Baking Company 4510
## 3 Wicked Spoon 3918
## 4 Earl of Sandwich 3447
## 5 Secret Pizza 3344
## 6 The Cosmopolitan of Las Vegas 3324
## 7 XS Nightclub 3136
## 8 Mon Ami Gabi 3130
## 9 Gordon Ramsay BurGR 3111
## 10 McCarran International Airport 2999
## # ... with more rows
Analyzing the Yelp Academic Dataset w/Drill & sergeant by Bob Rudis is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
Based on a work at https://drill.apache.org/docs/analyzing-the-yelp-academic-dataset/.