import numpy as np
import scipy.stats as spst
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import duckdb
import plotly.express as px
import warnings
<h1> Statistical Analysis of Yelp Dataset to Understand Restaurant Performance </h1>
Akash Singha - FT251009
Aditya Gulati - FT251007
Rohit Chowdhury - FT251065
Pragati Sharma - FT251053
Subhashish Mohan Kar - FT251092
<br>
Section 1 | Group - 18
<h3>Setup of the Database</h3>
con = duckdb.connect("yelp.db")  # connecting to the Yelp database
warnings.filterwarnings("ignore")
Importing the CSV files into different SQL tables
"""
con.sql( CREATE TABLE IF NOT EXISTS restos AS
FROM read_csv('yelp.restaurants_train.csv', sample_size = -1);
""")
"""
con.sql( CREATE TABLE IF NOT EXISTS resto_reviews AS
FROM read_csv('yelp.restaurant_reviews.csv', sample_size = -1);
""")
"""
con.sql( CREATE TABLE IF NOT EXISTS users AS
FROM read_csv('yelp.user.csv', sample_size = -1);
""")
"""
con.sql( CREATE TABLE IF NOT EXISTS user_friends AS
FROM read_csv('yelp.user_friends_full.csv', sample_size = -1);
""")
"""
con.sql( CREATE TABLE IF NOT EXISTS resto_test AS
FROM read_csv('yelp.restaurants_test.csv', sample_size = -1);
""")
<h2>Exploring the tables and their datatypes:</h2>
Restaurants_train table
result = con.execute("SELECT COUNT(*) FROM restos").fetchone()
row_count = result[0]
row_count
48800
"""
con.sql( SELECT * FROM restos
LIMIT 5;
""")
┌──────────────────────┬─────────────────────┬───────────┬───┬──────────────────────┬──────────────────────┐
│ name │ address │ city │ … │ attributes.Accepts… │ attributes.HairSpe… │
│ varchar │ varchar │ varchar │ │ boolean │ varchar │
├──────────────────────┼─────────────────────┼───────────┼───┼──────────────────────┼──────────────────────┤
│ Oskar Blues Taproom │ 921 Pearl St │ Boulder │ … │ NULL │ NULL │
│ Flying Elephants a… │ 7000 NE Airport Way │ Portland │ … │ NULL │ NULL │
│ Bob Likes Thai Food │ 3755 Main St │ Vancouver │ … │ NULL │ NULL │
│ Boxwood Biscuit │ 740 S High St │ Columbus │ … │ NULL │ NULL │
│ Mr G's Pizza & Subs │ 474 Lowell St │ Peabody │ … │ NULL │ NULL │
├──────────────────────┴─────────────────────┴───────────┴───┴──────────────────────┴──────────────────────┤
│ 5 rows 60 columns (5 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────┘
Converting the table to a dataframe:
df_restos = con.sql("""
SELECT * FROM restos
LIMIT 5;
""").df()
df_restos
name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | ... | attributes.Smoking | attributes.DriveThru | attributes.BYOBCorkage | attributes.Corkage | attributes.RestaurantsCounterService | attributes.DietaryRestrictions | attributes.AgesAllowed | attributes.Open24Hours | attributes.AcceptsInsurance | attributes.HairSpecializesIn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Oskar Blues Taproom | 921 Pearl St | Boulder | CO | 80302 | 40.017544 | -105.283348 | 4.0 | 86 | 1 | ... | None | None | None | None | NaN | None | None | NaN | NaN | None |
1 | Flying Elephants at PDX | 7000 NE Airport Way | Portland | OR | 97218 | 45.588906 | -122.593331 | 4.0 | 126 | 1 | ... | None | None | None | None | NaN | None | None | NaN | NaN | None |
2 | Bob Likes Thai Food | 3755 Main St | Vancouver | BC | V5V | 49.251342 | -123.101333 | 3.5 | 169 | 1 | ... | None | None | None | None | NaN | None | None | NaN | NaN | None |
3 | Boxwood Biscuit | 740 S High St | Columbus | OH | 43206 | 39.947007 | -82.997471 | 4.5 | 11 | 1 | ... | None | None | None | None | NaN | None | None | NaN | NaN | None |
4 | Mr G's Pizza & Subs | 474 Lowell St | Peabody | MA | 01960 | 42.541155 | -70.973438 | 4.0 | 39 | 1 | ... | None | None | None | None | NaN | None | None | NaN | NaN | None |
5 rows × 60 columns
Details for all the columns:
"""
con.sql( DESCRIBE restos
""").df()
column_name | column_type | null | key | default | extra | |
---|---|---|---|---|---|---|
0 | name | VARCHAR | YES | None | None | None |
1 | address | VARCHAR | YES | None | None | None |
2 | city | VARCHAR | YES | None | None | None |
3 | state | VARCHAR | YES | None | None | None |
4 | postal_code | VARCHAR | YES | None | None | None |
5 | latitude | DOUBLE | YES | None | None | None |
6 | longitude | DOUBLE | YES | None | None | None |
7 | stars | DOUBLE | YES | None | None | None |
8 | review_count | BIGINT | YES | None | None | None |
9 | is_open | BIGINT | YES | None | None | None |
10 | attributes.RestaurantsTableService | VARCHAR | YES | None | None | None |
11 | attributes.WiFi | VARCHAR | YES | None | None | None |
12 | attributes.BikeParking | VARCHAR | YES | None | None | None |
13 | attributes.BusinessParking | VARCHAR | YES | None | None | None |
14 | attributes.BusinessAcceptsCreditCards | VARCHAR | YES | None | None | None |
15 | attributes.RestaurantsReservations | VARCHAR | YES | None | None | None |
16 | attributes.WheelchairAccessible | VARCHAR | YES | None | None | None |
17 | attributes.Caters | VARCHAR | YES | None | None | None |
18 | attributes.OutdoorSeating | VARCHAR | YES | None | None | None |
19 | attributes.RestaurantsGoodForGroups | VARCHAR | YES | None | None | None |
20 | attributes.HappyHour | VARCHAR | YES | None | None | None |
21 | attributes.BusinessAcceptsBitcoin | BOOLEAN | YES | None | None | None |
22 | attributes.RestaurantsPriceRange2 | VARCHAR | YES | None | None | None |
23 | attributes.Ambience | VARCHAR | YES | None | None | None |
24 | attributes.HasTV | VARCHAR | YES | None | None | None |
25 | attributes.Alcohol | VARCHAR | YES | None | None | None |
26 | attributes.GoodForMeal | VARCHAR | YES | None | None | None |
27 | attributes.DogsAllowed | VARCHAR | YES | None | None | None |
28 | attributes.RestaurantsTakeOut | VARCHAR | YES | None | None | None |
29 | attributes.NoiseLevel | VARCHAR | YES | None | None | None |
30 | attributes.RestaurantsAttire | VARCHAR | YES | None | None | None |
31 | attributes.RestaurantsDelivery | VARCHAR | YES | None | None | None |
32 | categories | VARCHAR | YES | None | None | None |
33 | hours.Monday | VARCHAR | YES | None | None | None |
34 | hours.Tuesday | VARCHAR | YES | None | None | None |
35 | hours.Wednesday | VARCHAR | YES | None | None | None |
36 | hours.Thursday | VARCHAR | YES | None | None | None |
37 | hours.Friday | VARCHAR | YES | None | None | None |
38 | hours.Saturday | VARCHAR | YES | None | None | None |
39 | hours.Sunday | VARCHAR | YES | None | None | None |
40 | int_business_id | BIGINT | YES | None | None | None |
41 | attributes.GoodForKids | VARCHAR | YES | None | None | None |
42 | attributes.ByAppointmentOnly | BOOLEAN | YES | None | None | None |
43 | attributes | VARCHAR | YES | None | None | None |
44 | hours | VARCHAR | YES | None | None | None |
45 | attributes.Music | VARCHAR | YES | None | None | None |
46 | attributes.GoodForDancing | VARCHAR | YES | None | None | None |
47 | attributes.BestNights | VARCHAR | YES | None | None | None |
48 | attributes.BYOB | VARCHAR | YES | None | None | None |
49 | attributes.CoatCheck | VARCHAR | YES | None | None | None |
50 | attributes.Smoking | VARCHAR | YES | None | None | None |
51 | attributes.DriveThru | VARCHAR | YES | None | None | None |
52 | attributes.BYOBCorkage | VARCHAR | YES | None | None | None |
53 | attributes.Corkage | VARCHAR | YES | None | None | None |
54 | attributes.RestaurantsCounterService | BOOLEAN | YES | None | None | None |
55 | attributes.DietaryRestrictions | VARCHAR | YES | None | None | None |
56 | attributes.AgesAllowed | VARCHAR | YES | None | None | None |
57 | attributes.Open24Hours | BOOLEAN | YES | None | None | None |
58 | attributes.AcceptsInsurance | BOOLEAN | YES | None | None | None |
59 | attributes.HairSpecializesIn | VARCHAR | YES | None | None | None |
"""
con.sql( DESCRIBE restos
""")
┌──────────────────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │ null │ key │ default │ extra │
│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
├──────────────────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ name │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ address │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ city │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ state │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ postal_code │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ latitude │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ longitude │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ stars │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ review_count │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ is_open │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ · │ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │ · │
│ attributes.Smoking │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ attributes.DriveThru │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ attributes.BYOBCorkage │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ attributes.Corkage │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ attributes.RestaurantsCounterService │ BOOLEAN │ YES │ NULL │ NULL │ NULL │
│ attributes.DietaryRestrictions │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ attributes.AgesAllowed │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ attributes.Open24Hours │ BOOLEAN │ YES │ NULL │ NULL │ NULL │
│ attributes.AcceptsInsurance │ BOOLEAN │ YES │ NULL │ NULL │ NULL │
│ attributes.HairSpecializesIn │ VARCHAR │ YES │ NULL │ NULL │ NULL │
├──────────────────────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤
│ 60 rows (20 shown) 6 columns │
└────────────────────────────────────────────────────────────────────────────────────────────┘
Restaurant Reviews Table:
"""
con.sql( DESCRIBE resto_reviews
""").df()
column_name | column_type | null | key | default | extra | |
---|---|---|---|---|---|---|
0 | stars | BIGINT | YES | None | None | None |
1 | useful | BIGINT | YES | None | None | None |
2 | funny | BIGINT | YES | None | None | None |
3 | cool | BIGINT | YES | None | None | None |
4 | text | VARCHAR | YES | None | None | None |
5 | date | TIMESTAMP | YES | None | None | None |
6 | int_business_id | BIGINT | YES | None | None | None |
7 | int_user_id | BIGINT | YES | None | None | None |
8 | int_rest_review_id | BIGINT | YES | None | None | None |
df_restos_reviews = con.sql("""
SELECT * FROM resto_reviews
LIMIT 5;
""").df()
df_restos_reviews
stars | useful | funny | cool | text | date | int_business_id | int_user_id | int_rest_review_id | |
---|---|---|---|---|---|---|---|---|---|
0 | 2 | 1 | 1 | 1 | I've stayed at many Marriott and Renaissance M... | 2010-01-08 02:29:15 | 4954 | 6319642 | 2 |
1 | 2 | 0 | 0 | 0 | The setting is perfectly adequate, and the foo... | 2006-04-16 02:58:44 | 14180 | 292901 | 5 |
2 | 5 | 0 | 0 | 0 | I work in the Pru and this is the most afforda... | 2014-05-07 18:10:21 | 11779 | 6336225 | 7 |
3 | 5 | 5 | 3 | 3 | I loved everything about this place. I've only... | 2014-02-05 21:09:05 | 3216 | 552519 | 12 |
4 | 4 | 0 | 0 | 0 | I think their rice dishes are way better than ... | 2017-05-26 03:05:46 | 8748 | 544027 | 16 |
df_restos_reviews = con.sql("""
SELECT * FROM resto_reviews
LIMIT 5;
""")
df_restos_reviews
┌───────┬────────┬───────┬───────┬───┬─────────────────────┬─────────────────┬─────────────┬────────────────────┐
│ stars │ useful │ funny │ cool │ … │ date │ int_business_id │ int_user_id │ int_rest_review_id │
│ int64 │ int64 │ int64 │ int64 │ │ timestamp │ int64 │ int64 │ int64 │
├───────┼────────┼───────┼───────┼───┼─────────────────────┼─────────────────┼─────────────┼────────────────────┤
│ 2 │ 1 │ 1 │ 1 │ … │ 2010-01-08 02:29:15 │ 4954 │ 6319642 │ 2 │
│ 2 │ 0 │ 0 │ 0 │ … │ 2006-04-16 02:58:44 │ 14180 │ 292901 │ 5 │
│ 5 │ 0 │ 0 │ 0 │ … │ 2014-05-07 18:10:21 │ 11779 │ 6336225 │ 7 │
│ 5 │ 5 │ 3 │ 3 │ … │ 2014-02-05 21:09:05 │ 3216 │ 552519 │ 12 │
│ 4 │ 0 │ 0 │ 0 │ … │ 2017-05-26 03:05:46 │ 8748 │ 544027 │ 16 │
├───────┴────────┴───────┴───────┴───┴─────────────────────┴─────────────────┴─────────────┴────────────────────┤
│ 5 rows 9 columns (8 shown) │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
result = con.execute("SELECT COUNT(*) FROM resto_reviews").fetchone()
row_count = result[0]
row_count
1674096
Users Table
result = con.execute("SELECT COUNT(*) FROM users").fetchone()
row_count = result[0]
row_count
2189457
df_restos_users = con.sql("""
SELECT * FROM users
LIMIT 5;
""").df()
df_restos_users.to_excel('tw2o.xlsx')
"""
con.sql( DESCRIBE users
""").df()
column_name | column_type | null | key | default | extra | |
---|---|---|---|---|---|---|
0 | review_count | BIGINT | YES | None | None | None |
1 | yelping_since | TIMESTAMP | YES | None | None | None |
2 | useful | BIGINT | YES | None | None | None |
3 | funny | BIGINT | YES | None | None | None |
4 | cool | BIGINT | YES | None | None | None |
5 | elite | VARCHAR | YES | None | None | None |
6 | fans | BIGINT | YES | None | None | None |
7 | average_stars | DOUBLE | YES | None | None | None |
8 | compliment_hot | BIGINT | YES | None | None | None |
9 | compliment_more | BIGINT | YES | None | None | None |
10 | compliment_profile | BIGINT | YES | None | None | None |
11 | compliment_cute | BIGINT | YES | None | None | None |
12 | compliment_list | BIGINT | YES | None | None | None |
13 | compliment_note | BIGINT | YES | None | None | None |
14 | compliment_plain | BIGINT | YES | None | None | None |
15 | compliment_cool | BIGINT | YES | None | None | None |
16 | compliment_funny | BIGINT | YES | None | None | None |
17 | compliment_writer | BIGINT | YES | None | None | None |
18 | compliment_photos | BIGINT | YES | None | None | None |
19 | int_user_id | BIGINT | YES | None | None | None |
user_friends table:
"""
con.sql( DESCRIBE user_friends
""").df()
column_name | column_type | null | key | default | extra | |
---|---|---|---|---|---|---|
0 | int_user_id | BIGINT | YES | None | None | None |
1 | num_friends | BIGINT | YES | None | None | None |
df_restos_user_friends = con.sql("""
SELECT * FROM user_friends
LIMIT 5;
""").df()
df_restos_user_friends
int_user_id | num_friends | |
---|---|---|
0 | 1 | 5813 |
1 | 1248 | 6296 |
2 | 11604 | 835 |
3 | 2278 | 1452 |
4 | 13481 | 532 |
result = con.execute("SELECT COUNT(*) FROM user_friends").fetchone()
row_count = result[0]
row_count
2189457
restaurants_test table:
result = con.execute("SELECT COUNT(*) FROM resto_test").fetchone()
row_count = result[0]
row_count
1963
df_resto_test = con.sql("""
SELECT * FROM resto_test
LIMIT 5;
""").df()
df_resto_test
stars | name | address | city | state | postal_code | latitude | longitude | review_count | is_open | ... | attributes.Smoking | attributes.DriveThru | attributes.BYOBCorkage | attributes.Corkage | attributes.RestaurantsCounterService | attributes.DietaryRestrictions | attributes.AgesAllowed | attributes.Open24Hours | attributes.AcceptsInsurance | attributes.HairSpecializesIn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | None | 1st Avenue Seafood by Pocha | 901 Lougheed Hwy | Coquitlam | BC | V3K 3T3 | 49.237547 | -122.872628 | 12 | 0 | ... | None | None | None | None | None | None | None | None | None | None |
1 | None | Papi's Cuban & Caribbean Grill | 216 Ponce De Leon Ave NE | Atlanta | GA | 30308 | 33.772758 | -84.380375 | 1001 | 1 | ... | None | None | 'yes_free' | FALSE | None | None | None | None | None | None |
2 | None | D'angelo Sandwich Shop | 1277 Broadway | Saugus | MA | 01906 | 42.461100 | -71.026910 | 6 | 1 | ... | None | None | None | None | None | None | None | None | None | None |
3 | None | Mandarin House | 6263 Roswell Rd NE | Atlanta | GA | 30328 | 33.925975 | -84.378159 | 22 | 0 | ... | None | None | None | None | None | None | None | None | None | None |
4 | None | Jr Crickets | 1197 Peachtree St NE | Atlanta | GA | 30361 | 33.786873 | -84.382572 | 25 | 0 | ... | None | None | None | None | None | None | None | None | None | None |
5 rows × 60 columns
"""
con.sql( DESCRIBE resto_test
""").df()
column_name | column_type | null | key | default | extra | |
---|---|---|---|---|---|---|
0 | stars | VARCHAR | YES | None | None | None |
1 | name | VARCHAR | YES | None | None | None |
2 | address | VARCHAR | YES | None | None | None |
3 | city | VARCHAR | YES | None | None | None |
4 | state | VARCHAR | YES | None | None | None |
5 | postal_code | VARCHAR | YES | None | None | None |
6 | latitude | DOUBLE | YES | None | None | None |
7 | longitude | DOUBLE | YES | None | None | None |
8 | review_count | BIGINT | YES | None | None | None |
9 | is_open | BIGINT | YES | None | None | None |
10 | attributes.RestaurantsTableService | VARCHAR | YES | None | None | None |
11 | attributes.WiFi | VARCHAR | YES | None | None | None |
12 | attributes.BikeParking | BOOLEAN | YES | None | None | None |
13 | attributes.BusinessParking | VARCHAR | YES | None | None | None |
14 | attributes.BusinessAcceptsCreditCards | VARCHAR | YES | None | None | None |
15 | attributes.RestaurantsReservations | VARCHAR | YES | None | None | None |
16 | attributes.WheelchairAccessible | BOOLEAN | YES | None | None | None |
17 | attributes.Caters | VARCHAR | YES | None | None | None |
18 | attributes.OutdoorSeating | VARCHAR | YES | None | None | None |
19 | attributes.RestaurantsGoodForGroups | BOOLEAN | YES | None | None | None |
20 | attributes.HappyHour | BOOLEAN | YES | None | None | None |
21 | attributes.BusinessAcceptsBitcoin | BOOLEAN | YES | None | None | None |
22 | attributes.RestaurantsPriceRange2 | VARCHAR | YES | None | None | None |
23 | attributes.Ambience | VARCHAR | YES | None | None | None |
24 | attributes.HasTV | VARCHAR | YES | None | None | None |
25 | attributes.Alcohol | VARCHAR | YES | None | None | None |
26 | attributes.GoodForMeal | VARCHAR | YES | None | None | None |
27 | attributes.DogsAllowed | BOOLEAN | YES | None | None | None |
28 | attributes.RestaurantsTakeOut | VARCHAR | YES | None | None | None |
29 | attributes.NoiseLevel | VARCHAR | YES | None | None | None |
30 | attributes.RestaurantsAttire | VARCHAR | YES | None | None | None |
31 | attributes.RestaurantsDelivery | VARCHAR | YES | None | None | None |
32 | categories | VARCHAR | YES | None | None | None |
33 | hours.Monday | VARCHAR | YES | None | None | None |
34 | hours.Tuesday | VARCHAR | YES | None | None | None |
35 | hours.Wednesday | VARCHAR | YES | None | None | None |
36 | hours.Thursday | VARCHAR | YES | None | None | None |
37 | hours.Friday | VARCHAR | YES | None | None | None |
38 | hours.Saturday | VARCHAR | YES | None | None | None |
39 | hours.Sunday | VARCHAR | YES | None | None | None |
40 | int_business_id | BIGINT | YES | None | None | None |
41 | attributes.GoodForKids | VARCHAR | YES | None | None | None |
42 | attributes.ByAppointmentOnly | BOOLEAN | YES | None | None | None |
43 | attributes | VARCHAR | YES | None | None | None |
44 | hours | VARCHAR | YES | None | None | None |
45 | attributes.Music | VARCHAR | YES | None | None | None |
46 | attributes.GoodForDancing | VARCHAR | YES | None | None | None |
47 | attributes.BestNights | VARCHAR | YES | None | None | None |
48 | attributes.BYOB | BOOLEAN | YES | None | None | None |
49 | attributes.CoatCheck | BOOLEAN | YES | None | None | None |
50 | attributes.Smoking | VARCHAR | YES | None | None | None |
51 | attributes.DriveThru | VARCHAR | YES | None | None | None |
52 | attributes.BYOBCorkage | VARCHAR | YES | None | None | None |
53 | attributes.Corkage | VARCHAR | YES | None | None | None |
54 | attributes.RestaurantsCounterService | VARCHAR | YES | None | None | None |
55 | attributes.DietaryRestrictions | VARCHAR | YES | None | None | None |
56 | attributes.AgesAllowed | VARCHAR | YES | None | None | None |
57 | attributes.Open24Hours | VARCHAR | YES | None | None | None |
58 | attributes.AcceptsInsurance | VARCHAR | YES | None | None | None |
59 | attributes.HairSpecializesIn | VARCHAR | YES | None | None | None |
Descriptive Analysis
Average stars given to the restaurants
"""
con.sql( SELECT avg(stars) FROM restos
""")
┌───────────────────┐
│ avg(stars) │
│ double │
├───────────────────┤
│ 3.527858606557377 │
└───────────────────┘
df_temp = con.sql("""
SELECT stars FROM restos
""").df()
df_temp
stars | |
---|---|
0 | 4.0 |
1 | 4.0 |
2 | 3.5 |
3 | 4.5 |
4 | 4.0 |
... | ... |
48795 | 2.0 |
48796 | 3.0 |
48797 | 3.0 |
48798 | 4.0 |
48799 | 4.5 |
48800 rows × 1 columns
Descriptive statistics of the stars column
df_temp.describe()
stars | |
---|---|
count | 48800.000000 |
mean | 3.527859 |
std | 0.773849 |
min | 1.000000 |
25% | 3.000000 |
50% | 3.500000 |
75% | 4.000000 |
max | 5.000000 |
df_temp.isnull().sum().sum()  # check whether there are any null values in the column
0
Histogram plot of the stars column
sns.displot(data=df_temp, x="stars", kind="hist")
Average stars on Yelp received by restaurants, grouped by state
df_temp = con.sql("""
SELECT state, avg(stars) as avg_stars FROM restos
GROUP BY state
ORDER BY avg_stars DESC
""").df()
df_temp
state | avg_stars | |
---|---|---|
0 | ABE | 4.500000 |
1 | NH | 4.000000 |
2 | OR | 3.752074 |
3 | TX | 3.656262 |
4 | CO | 3.587216 |
5 | FL | 3.521030 |
6 | MN | 3.500000 |
7 | MA | 3.480819 |
8 | BC | 3.454463 |
9 | WA | 3.450202 |
10 | OH | 3.421853 |
11 | GA | 3.399796 |
12 | VA | 2.000000 |
13 | KY | 2.000000 |
14 | KS | 2.000000 |
15 | WY | 1.500000 |
df = df_temp
plt.figure(figsize=(12, 8))
plt.bar(df['state'], df['avg_stars'], color='orange')
plt.xlabel('State')
plt.ylabel('Average Stars')
plt.title('Average Star Ratings by State')
plt.xticks(rotation=45)  # Rotate state labels for better readability
plt.ylim(0, 5)  # Set y-axis limit to make the plot more readable
plt.show()
From this plot we can see which states have a large number of highly rated restaurants and where such restaurants are rare.
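As a rough sketch of that observation (the query and the df_quality name are illustrative, assuming the restos table created above), we can count highly rated restaurants per state alongside the average rating:

df_quality = con.sql("""
SELECT state,
       COUNT(*) AS n_restaurants,
       SUM(CASE WHEN stars >= 4 THEN 1 ELSE 0 END) AS n_high_rated,
       AVG(stars) AS avg_stars
FROM restos
GROUP BY state
ORDER BY n_high_rated DESC
""").df()
df_quality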
= con.sql("""SELECT
df1 r.city,
COUNT(rv.int_rest_review_id) AS review_count
FROM
restos r
JOIN
resto_reviews rv
ON
r.int_business_id = rv.int_business_id
GROUP BY
r.city""").df()
df1
city | review_count | |
---|---|---|
0 | East Boston | 655 |
1 | Cocoa | 1483 |
2 | Wellesley | 1857 |
3 | Beverly | 3061 |
4 | Norcross | 425 |
... | ... | ... |
443 | east boston | 8 |
444 | Griffin | 6 |
445 | DAVENPORT | 15 |
446 | Lake Buena Visa | 13 |
447 | Orlanto | 5 |
448 rows × 2 columns
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="city_mapper")

def geocode(city):
    try:
        location = geolocator.geocode(city)
        return location.latitude, location.longitude
    except:
        return None, None

df1['latitude'], df1['longitude'] = zip(*df1['city'].apply(geocode))
import geopandas as gpd
import matplotlib.pyplot as plt

df1 = df1.dropna(subset=['latitude', 'longitude'])
gdf = gpd.GeoDataFrame(df1, geometry=gpd.points_from_xy(df1.longitude, df1.latitude))

# Load a map of the world
world = gpd.read_file("C:/Users/akash/Code/project-20240826T060102Z-001/project/ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp")

# Plot the world map
fig, ax = plt.subplots(figsize=(15, 10))
world.boundary.plot(ax=ax, linewidth=1, color='black')  # Plot national borders

# Adjust marker sizes and transparency
scaling_factor = 0.005  # Reduce this factor to make the circles smaller
gdf.plot(ax=ax, color='red', markersize=gdf['review_count'] * scaling_factor, alpha=0.5)

top_10 = gdf.nlargest(10, 'review_count')

# Add labels for only the top 10 places
for x, y, label in zip(top_10.geometry.x, top_10.geometry.y, top_10['city']):
    ax.text(x, y, s=label, fontsize=6, ha='center', va='center', color='black', alpha=1)

plt.show()
= con.sql("""SELECT
df2 r.state,
COUNT(rv.int_rest_review_id) AS review_count
FROM
restos r
JOIN
resto_reviews rv
ON
r.int_business_id = rv.int_business_id
GROUP BY
r.state""").df()
df2
state | review_count | |
---|---|---|
0 | OH | 86476 |
1 | OR | 266231 |
2 | VA | 2 |
3 | CO | 26556 |
4 | BC | 122825 |
5 | WY | 1 |
6 | MN | 2 |
7 | KS | 4 |
8 | FL | 221779 |
9 | TX | 249741 |
10 | MA | 393942 |
11 | GA | 229046 |
12 | WA | 20484 |
13 | NH | 4 |
14 | KY | 1 |
15 | ABE | 3 |
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="state_mapper")

def geocode(state):
    try:
        # Note: bare two-letter abbreviations are ambiguous worldwide, so some states geocode to the wrong place (visible below)
        location = geolocator.geocode(state)
        return location.latitude, location.longitude
    except:
        return None, None

df2['latitude'], df2['longitude'] = zip(*df2['state'].apply(geocode))
df2
state | review_count | latitude | longitude | |
---|---|---|---|---|
0 | OH | 86476 | 40.225357 | -82.688140 |
1 | OR | 266231 | 43.979280 | -120.737257 |
2 | VA | 2 | 41.903411 | 12.452853 |
3 | CO | 26556 | 4.099917 | -72.908813 |
4 | BC | 122825 | 55.001251 | -125.002441 |
5 | WY | 1 | 43.170026 | -107.568534 |
6 | MN | 2 | 46.825039 | 103.849974 |
7 | KS | 4 | 38.273120 | -98.582187 |
8 | FL | 221779 | 27.756767 | -81.463983 |
9 | TX | 249741 | 31.263890 | -98.545612 |
10 | MA | 393942 | 31.172821 | -7.336248 |
11 | GA | 229046 | -0.899969 | 11.689970 |
12 | WA | 20484 | -25.230300 | 121.018725 |
13 | NH | 4 | 52.721283 | 4.820665 |
14 | KY | 1 | 19.703182 | -79.917463 |
15 | ABE | 3 | 48.891946 | 2.245066 |
import geopandas as gpd
import matplotlib.pyplot as plt

df2 = df2.dropna(subset=['latitude', 'longitude'])
gdf = gpd.GeoDataFrame(df2, geometry=gpd.points_from_xy(df2.longitude, df2.latitude))

# Load a map of the world
world = gpd.read_file("C:/Users/akash/Code/project-20240826T060102Z-001/project/ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp")

# Plot the world map
fig, ax = plt.subplots(figsize=(15, 10))
world.boundary.plot(ax=ax, linewidth=1, color='black')  # Plot national borders

# Adjust marker sizes and transparency
scaling_factor = 0.005  # Reduce this factor to make the circles smaller
gdf.plot(ax=ax, color='red', markersize=gdf['review_count'] * scaling_factor, alpha=0.5)

for x, y, label in zip(gdf.geometry.x, gdf.geometry.y, gdf['state']):
    ax.text(x, y, s=label, fontsize=8, ha='center', va='center', color='black', alpha=0.7)

plt.show()
<h1>Focusing on US states, since the majority of restaurants are in the USA</h1>
df_temp = con.sql("""
SELECT state, avg(stars) as Mean_stars FROM restos
GROUP BY state
""").df()

fig = px.choropleth(df_temp,
                    locations=df_temp["state"],
                    locationmode="USA-states",
                    scope="usa",
                    color=df_temp["Mean_stars"],
                    color_continuous_scale="greens")
fig.show()
Looking at these plots together with the review counts, we can identify places where the average star rating is low but the number of reviews is high, such as BC, OH, or GA. A high review count signals strong demand, while a low average rating suggests a shortage of good-quality restaurants, so these look like promising locations for opening a new restaurant.
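As a sketch of that screening idea (the query and the df_opportunity name are illustrative, assuming the restos and resto_reviews tables created above), we can rank states by review volume and average rating in one query and keep the high-demand, below-median-rating ones:

df_opportunity = con.sql("""
SELECT r.state,
       COUNT(rv.int_rest_review_id) AS review_count,
       AVG(r.stars) AS avg_stars   -- note: averaging after the join weights each restaurant by its review count
FROM restos r
JOIN resto_reviews rv
    ON r.int_business_id = rv.int_business_id
GROUP BY r.state
ORDER BY review_count DESC
""").df()
df_opportunity[df_opportunity['avg_stars'] < df_opportunity['avg_stars'].median()]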
<h1>Descriptive Analysis</h1>
"""
con.sql( SELECT state, avg(stars) as Mean_stars, median(stars) as Median_stars, mode(stars) as Mode_stars, stddev_samp(stars) as Standard_deviation FROM restos
GROUP BY state
ORDER BY Mean_stars DESC
""")
┌─────────┬────────────────────┬──────────────┬────────────┬────────────────────┐
│ state │ Mean_stars │ Median_stars │ Mode_stars │ Standard_deviation │
│ varchar │ double │ double │ double │ double │
├─────────┼────────────────────┼──────────────┼────────────┼────────────────────┤
│ ABE │ 4.5 │ 4.5 │ 4.5 │ NULL │
│ NH │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ OR │ 3.7520742511601743 │ 4.0 │ 4.0 │ 0.7493932759028515 │
│ TX │ 3.6562618956985156 │ 4.0 │ 4.0 │ 0.7900977902196283 │
│ CO │ 3.587216248506571 │ 3.5 │ 4.0 │ 0.7164616119919883 │
│ FL │ 3.5210299272040984 │ 3.5 │ 4.0 │ 0.8193855357786395 │
│ MN │ 3.5 │ 3.5 │ 3.5 │ NULL │
│ MA │ 3.480818540433925 │ 3.5 │ 3.5 │ 0.7230047660400158 │
│ BC │ 3.454463292547275 │ 3.5 │ 3.5 │ 0.6927356872193546 │
│ WA │ 3.4502018842530284 │ 3.5 │ 4.0 │ 0.8201944658940568 │
│ OH │ 3.421852731591449 │ 3.5 │ 4.0 │ 0.8283107681322551 │
│ GA │ 3.399796195652174 │ 3.5 │ 3.5 │ 0.7965101129423892 │
│ KY │ 2.0 │ 2.0 │ 2.0 │ NULL │
│ VA │ 2.0 │ 2.0 │ 2.0 │ NULL │
│ KS │ 2.0 │ 2.0 │ 2.0 │ NULL │
│ WY │ 1.5 │ 1.5 │ 1.5 │ NULL │
├─────────┴────────────────────┴──────────────┴────────────┴────────────────────┤
│ 16 rows 5 columns │
└───────────────────────────────────────────────────────────────────────────────┘
"""
con.sql( SELECT city, avg(stars) as Mean_stars, median(stars) as Median_stars, mode(stars) as Mode_stars, stddev_samp(stars) as Standard_deviation FROM restos
GROUP BY city
ORDER BY Mean_stars DESC
""")
┌───────────────────┬────────────┬──────────────┬────────────┬────────────────────┐
│ city │ Mean_stars │ Median_stars │ Mode_stars │ Standard_deviation │
│ varchar │ double │ double │ double │ double │
├───────────────────┼────────────┼──────────────┼────────────┼────────────────────┤
│ Wekiva Springs │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ columbus │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Jackson │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Winter park │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Brookwood │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Gotha │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ St Johns │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ portland │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Northeast Orlando │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Union Park │ 4.75 │ 4.75 │ 4.5 │ 0.3535533905932738 │
│ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │
│ Rockledge │ 2.0 │ 2.0 │ 2.0 │ NULL │
│ PORT COQUITLAM │ 1.5 │ 1.5 │ 1.5 │ NULL │
│ Southeast Orlando │ 1.5 │ 1.5 │ 1.5 │ NULL │
│ WINTER GARDEN │ 1.5 │ 1.5 │ 1.5 │ NULL │
│ Sheridan │ 1.5 │ 1.5 │ 1.5 │ NULL │
│ E.Point │ 1.5 │ 1.5 │ 1.5 │ NULL │
│ Candler-McAfee │ 1.5 │ 1.5 │ 1.5 │ NULL │
│ Englewood │ 1.5 │ 1.5 │ 1.5 │ NULL │
│ Clarkson │ 1.5 │ 1.5 │ 1.5 │ NULL │
│ Miami Beach │ 1.0 │ 1.0 │ 1.0 │ NULL │
├───────────────────┴────────────┴──────────────┴────────────┴────────────────────┤
│ 452 rows (20 shown) 5 columns │
└─────────────────────────────────────────────────────────────────────────────────┘
"""
con.sql( SELECT categories, avg(stars) as Mean_stars, median(stars) as Median_stars, mode(stars) as Mode_stars, stddev_samp(stars) as Standard_deviation FROM restos
GROUP BY categories
ORDER BY Mean_stars DESC
""")
┌────────────────────────────────────────────────────────┬────────────┬──────────────┬────────────┬────────────────────┐
│ categories │ Mean_stars │ Median_stars │ Mode_stars │ Standard_deviation │
│ varchar │ double │ double │ double │ double │
├────────────────────────────────────────────────────────┼────────────┼──────────────┼────────────┼────────────────────┤
│ Desserts, French, Food, Restaurants, Creperies │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Coffee & Tea, Sandwiches, Restaurants, Food, Coffee … │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Falafel, Middle Eastern, Lebanese, Restaurants, Gree… │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Mexican, Tacos, Food, Restaurants, Food Trucks │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Middle Eastern, Arabian, Kebab, Mediterranean, Halal… │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Restaurants, Food, Korean, Food Trucks │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Middle Eastern, Food, Turkish, Restaurants │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Bars, Nightlife, Arts & Entertainment, American (Tra… │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Food Trucks, Street Vendors, Breakfast & Brunch, Mex… │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ Restaurants, Delicatessen, Delis, Food │ 5.0 │ 5.0 │ 5.0 │ NULL │
│ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │
│ American (New), Hot Dogs, Vegan, Burgers, American (… │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Food Trucks, Seafood, Food, Seafood Markets, Restaur… │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Poke, Food, Salad, Restaurants, Hawaiian │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Korean, Food Trucks, Street Vendors, American (New),… │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Soup, Restaurants, Sandwiches, American (New) │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Coffee & Tea, Breakfast & Brunch, Restaurants, Food,… │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Breakfast & Brunch, Restaurants, Coffee & Tea, Food,… │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Food, Thai, Bars, Nightlife, Taiwanese, Restaurants,… │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Pubs, Bars, American (Traditional), Nightlife, Tapas… │ 4.0 │ 4.0 │ 4.0 │ NULL │
│ Food, Cafes, Coffee & Tea, Sandwiches, Restaurants │ 4.0 │ 4.0 │ 4.0 │ NULL │
├────────────────────────────────────────────────────────┴────────────┴──────────────┴────────────┴────────────────────┤
│ ? rows (>9999 rows, 20 shown) 5 columns │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
How many restaurants provide WiFi?
"""
con.sql( SELECT DISTINCT "attributes.WiFi" FROM restos
""").df()
attributes.WiFi | |
---|---|
0 | u'no' |
1 | u'paid' |
2 | 'paid' |
3 | None |
4 | 'no' |
5 | u'free' |
6 | 'free' |
7 | None |
Cleaning the data: the WiFi attribute stores the same value under different encodings (e.g. u'free' and 'free'), so we combine them.
df_temp = con.sql("""
SELECT "attributes.WiFi" AS wifi, count("attributes.WiFi") AS count FROM restos
GROUP BY wifi
""").df()
df_temp
wifi | count | |
---|---|---|
0 | 'free' | 5496 |
1 | u'paid' | 184 |
2 | 'no' | 4394 |
3 | None | 29 |
4 | None | 0 |
5 | u'free' | 14994 |
6 | u'no' | 11706 |
7 | 'paid' | 81 |
df_new = pd.DataFrame({
    "wifi": ["None", "Free", "No", "Paid"],
    "count": [
        (df_temp["count"][0] + df_temp["count"][7]),
        (df_temp["count"][1] + df_temp["count"][3]),
        (df_temp["count"][2] + df_temp["count"][5]),
        (df_temp["count"][4] + df_temp["count"][6])
    ]
})
df_new
wifi | count | |
---|---|---|
0 | None | 5577 |
1 | Free | 213 |
2 | No | 19388 |
3 | Paid | 11706 |
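The hard-coded row indices above depend on the order in which GROUP BY happens to return the rows. A more robust sketch (assuming the same df_temp with wifi and count columns; normalize_wifi is an illustrative helper) normalizes the raw labels before aggregating:

def normalize_wifi(value):
    # Strip the u'' / '' quoting that the raw attribute values carry
    if value is None:
        return "None"
    v = str(value).strip().lstrip("u").strip("'").lower()
    return {"free": "Free", "no": "No", "paid": "Paid"}.get(v, "None")

df_new = (df_temp.assign(wifi=df_temp["wifi"].map(normalize_wifi))
                 .groupby("wifi", as_index=False)["count"].sum())
df_new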
Finding the useful votes received by reviews, grouped by star rating
df_temp = con.sql("""
SELECT stars, sum(useful) as sum_useful, avg(useful) as avg_useful, max(useful) AS max_useful FROM resto_reviews
GROUP BY stars
ORDER BY stars ASC
""").df()
df_temp
stars | sum_useful | avg_useful | max_useful | |
---|---|---|---|---|
0 | 1 | 239163.0 | 1.256445 | 146 |
1 | 2 | 184718.0 | 1.192214 | 260 |
2 | 3 | 216230.0 | 1.000694 | 177 |
3 | 4 | 454666.0 | 1.036895 | 411 |
4 | 5 | 556257.0 | 0.825011 | 223 |
"husl")
sns.set_palette("talk")
sns.set_context(= df_temp, x = "stars", y = "sum_useful", kind = "bar") sns.catplot(data
"husl")
sns.set_palette(= df_temp, x = "stars", y = "avg_useful", kind = "bar") sns.catplot(data
df_table = con.sql("""
WITH BusinessCategories AS (
    SELECT UNNEST(STRING_TO_ARRAY(categories, ',')) AS category
    FROM restos
    WHERE stars >= 4
)
SELECT category, COUNT(*) AS count
FROM BusinessCategories
GROUP BY category
ORDER BY count DESC;
""").df()
df_table['category'] = df_table['category'].str.strip()
df_grouped = df_table.groupby('category', as_index=False).sum()
df_sorted = df_grouped.sort_values(by='count', ascending=False)
df_top20 = df_sorted.head(20)
df_sorted
category | count | |
---|---|---|
470 | Restaurants | 21029 |
228 | Food | 8735 |
390 | Nightlife | 3570 |
53 | Bars | 3428 |
477 | Sandwiches | 3158 |
... | ... | ... |
452 | Psychics | 1 |
453 | Pub Food | 1 |
216 | Fences & Gates | 1 |
455 | Public Relations | 1 |
96 | Cabinetry | 1 |
617 rows × 2 columns
plt.figure(figsize=(12, 6))  # Adjust the width (12) and height (6) as needed
# Bar plot of the top 20 categories using seaborn
sns.catplot(data=df_top20, x="category", y="count", palette='viridis', kind="bar", height=8, aspect=2.5)
plt.xticks(rotation=45)  # Rotate labels if they overlap, adjust the angle as needed

# Customize the plot
plt.title('Top 20 Categories among Highly Rated Restaurants')
plt.xlabel('Category')
plt.ylabel('Count')

# Add padding around the plot
plt.tight_layout()

# Show the plot
plt.show()
<Figure size 1200x600 with 0 Axes>
From this we can see that restaurants with higher star ratings are mainly categorised under Restaurants, Food, Nightlife, and Bars, so a restaurant-cum-bar with great food and nightlife seems a good option.
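A quick way to sanity-check that takeaway (a sketch, assuming the restos table and its comma-separated categories column) is to compare average ratings of restaurants tagged with both Bars and Nightlife against the rest:

con.sql("""
SELECT (categories ILIKE '%Bars%' AND categories ILIKE '%Nightlife%') AS bar_and_nightlife,
       COUNT(*) AS n,
       AVG(stars) AS avg_stars
FROM restos
GROUP BY bar_and_nightlife
""")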
We'll try to find whether there is a correlation between star ratings and some other variables using Spearman rank correlation.
def spear_correal(df, columnName):
    df.fillna('None', inplace=True)
    df.isnull().sum().sum()
    df1 = df[df[columnName].isin(['True', 'False'])]
    df1 = df1.copy()
    df1[columnName] = df1[columnName].apply(lambda x: str(x))
    df1[columnName] = df1[columnName].map({'True': 1, 'False': 0})

    # Calculate Spearman correlation
    correlation, p_value = spearmanr(df1['stars'], df1[columnName])

    print(f"Spearman Correlation for Stars and {columnName}: {correlation}")
    print(f"P-Value: {p_value}")
from scipy.stats import spearmanr

df_temp = con.sql("""
SELECT stars, "attributes.OutdoorSeating" AS outdoorSetting FROM restos
ORDER BY stars ASC
""").df()
spear_correal(df_temp, 'outdoorSetting')
Spearman Correlation for Stars and outdoorSetting: 0.16219573052211875
P-Value: 1.1884858999560221e-236
df_temp = con.sql("""
SELECT stars, "attributes.GoodForKids" AS GoodForKids FROM restos
ORDER BY stars ASC
""").df()
spear_correal(df_temp, 'GoodForKids')
Spearman Correlation for Stars and GoodForKids: 0.0066133919328259915
P-Value: 0.18340820709810265
df_temp = con.sql("""
SELECT stars, "attributes.RestaurantsTableService" AS RestaurantsTableService FROM restos
ORDER BY stars ASC
""").df()
spear_correal(df_temp, 'RestaurantsTableService')
Spearman Correlation for Stars and RestaurantsTableService: -0.0994801035971547
P-Value: 6.084927694546583e-42
df_temp = con.sql("""
SELECT stars, "attributes.RestaurantsGoodForGroups" AS RestaurantsGoodForGroups FROM restos
ORDER BY stars ASC
""").df()
spear_correal(df_temp, 'RestaurantsGoodForGroups')
Spearman Correlation for Stars and RestaurantsGoodForGroups: -0.0026774151362220916
P-Value: 0.5880048903696755
df_temp = con.sql("""
SELECT stars, "attributes.WiFi" AS Wifi FROM restos
ORDER BY stars ASC
""").df()

def clean_value(value):
    if value is None or value.lower() == 'none':
        return None
    elif 'free' in value.lower():
        return 'True'
    elif 'no' in value.lower():
        return 'False'
    elif 'paid' in value.lower():
        return 'paid'
    else:
        return value

# Apply the function to the column
df_temp['Wifi'] = df_temp['Wifi'].apply(clean_value)

spear_correal(df_temp, 'Wifi')
Spearman Correlation for Stars and Wifi: 0.03168203012314857
P-Value: 1.3476091835891137e-09
df_temp = con.sql("""
SELECT stars, "attributes.BusinessAcceptsCreditCards" AS BusinessAcceptsCreditCards FROM restos
ORDER BY stars ASC
""").df()
spear_correal(df_temp, 'BusinessAcceptsCreditCards')
Spearman Correlation for Stars and BusinessAcceptsCreditCards: -0.07706876996276704
P-Value: 3.59912693693516e-53
df_temp = con.sql("""
SELECT
    user_friends.int_user_id,
    user_friends.num_friends,
    users.useful
FROM user_friends
JOIN users
    ON users.int_user_id = user_friends.int_user_id;
""").df()
df_temp
plt.scatter(df_temp['num_friends'], df_temp['useful'])
plt.xlabel('Number of Friends')
plt.ylabel('Useful')
plt.title('Scatter Plot of Number of Friends vs. Useful')
Text(0.5, 1.0, 'Scatter Plot of Number of Friends vs. Useful')
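The scatter plot suggests a positive relationship; a quick sketch to quantify it (assuming df_temp from the join above is still in memory) is the Spearman rank correlation, which is robust to the heavy right skew of both variables:

from scipy.stats import spearmanr

rho, p = spearmanr(df_temp['num_friends'], df_temp['useful'])
print(f"Spearman correlation between num_friends and useful: {rho:.3f} (p-value: {p:.3g})")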
<h1> Pie Chart of Stars </h1>
df_temp = con.sql("""
SELECT stars, count(stars) AS starsCount FROM resto_reviews
GROUP BY stars
ORDER BY stars ASC
""").df()
df_temp

colors = sns.color_palette('husl')
df_temp.plot.pie(y='starsCount', labels=df_temp['stars'], autopct='%1.1f%%', figsize=(8, 8), colors=colors)

# Set title
plt.title('Pie Chart of Stars')

# Show the plot
plt.show()
<h1>Hypothesis Testing</h1>
df_restos = con.sql("""
SELECT int_business_id, name, address, city, state, postal_code, stars, review_count, is_open, "attributes.RestaurantsTableService", "attributes.BikeParking", "attributes.BusinessAcceptsCreditCards", "attributes.RestaurantsReservations", "attributes.OutdoorSeating", "attributes.RestaurantsTakeOut", "attributes.GoodForKids", "attributes.Open24Hours" FROM restos;
""").df()
data_attributes = []
arr = []
t_stat = []
p_val = []
sign = []
significant_attributes = []

for i in df_restos.columns:
    if i.startswith("attributes"):
        print("attribute Name: {}".format(i))
        print("attribute unique values : {}".format(df_restos[i].unique()))
        data_attributes.append(i)
print(data_attributes)

rows_count = len(df_restos)

def hypothesis_test(grp1, grp2, attribute):
    group1 = grp1  # Ratings for restaurants where the attribute is True
    group2 = grp2  # Ratings for restaurants where the attribute is False
    t_statistic, p_value = spst.ttest_ind(group1, group2, alternative='two-sided')

    arr.append(attribute)
    t_stat.append(t_statistic)
    p_val.append(p_value)
    if p_value < 0.1:
        is_significant = True
        significant_attributes.append(attribute)
    else:
        is_significant = False
    sign.append(is_significant)
    print(f"T-Statistic: {t_statistic}")
    print(f"P-Value: {p_value}")
for attribute in data_attributes:
    grp1 = []
    grp2 = []
    for index in range(rows_count):
        attribute_value = df_restos.loc[index, attribute]
        if (attribute_value == "True" or attribute_value == True):
            grp1.append(df_restos.loc[index, "stars"])
        elif (attribute_value == "False" or attribute_value == False):
            grp2.append(df_restos.loc[index, "stars"])

    hypothesis_test(grp1, grp2, attribute)
df_results = pd.DataFrame({'attributes': arr, 't-stat': t_stat, 'p-value': p_val, 'Is-significant(p<alpha)': sign})
df_results
attribute Name: attributes.RestaurantsTableService
attribute unique values : ['True' 'Unknown' 'False' 'None']
attribute Name: attributes.BikeParking
attribute unique values : ['True' 'False' 'Unknown' 'None']
attribute Name: attributes.BusinessAcceptsCreditCards
attribute unique values : ['True' 'Unknown' 'False' 'None']
attribute Name: attributes.RestaurantsReservations
attribute unique values : ['False' 'True' 'Unknown' 'None']
attribute Name: attributes.OutdoorSeating
attribute unique values : ['True' 'False' 'Unknown' 'None']
attribute Name: attributes.RestaurantsTakeOut
attribute unique values : ['True' 'Unknown' 'None' 'False']
attribute Name: attributes.GoodForKids
attribute unique values : ['Unknown' 'True' 'False' 'None']
attribute Name: attributes.Open24Hours
attribute unique values : ['Unknown' False True]
['attributes.RestaurantsTableService', 'attributes.BikeParking', 'attributes.BusinessAcceptsCreditCards', 'attributes.RestaurantsReservations', 'attributes.OutdoorSeating', 'attributes.RestaurantsTakeOut', 'attributes.GoodForKids', 'attributes.Open24Hours']
T-Statistic: -7.372668695798382
P-Value: 1.742929168567777e-13
T-Statistic: 27.617727216768504
P-Value: 4.8420795506096176e-166
T-Statistic: -14.802489839883231
P-Value: 1.9152766670078407e-49
T-Statistic: 14.247260843911603
P-Value: 5.9719676544221295e-46
T-Statistic: 34.46783403696211
P-Value: 1.2772571268991985e-256
T-Statistic: -2.9917912366476616
P-Value: 0.0027749658574526786
T-Statistic: 1.39949765413476
P-Value: 0.16167146721429418
T-Statistic: 0.0
P-Value: 1.0
attributes | t-stat | p-value | Is-significant(p<alpha) | |
---|---|---|---|---|
0 | attributes.RestaurantsTableService | -7.372669 | 1.742929e-13 | True |
1 | attributes.BikeParking | 27.617727 | 4.842080e-166 | True |
2 | attributes.BusinessAcceptsCreditCards | -14.802490 | 1.915277e-49 | True |
3 | attributes.RestaurantsReservations | 14.247261 | 5.971968e-46 | True |
4 | attributes.OutdoorSeating | 34.467834 | 1.277257e-256 | True |
5 | attributes.RestaurantsTakeOut | -2.991791 | 2.774966e-03 | True |
6 | attributes.GoodForKids | 1.399498 | 1.616715e-01 | False |
7 | attributes.Open24Hours | 0.000000 | 1.000000e+00 | False |
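The t-tests above rely on scipy's default equal-variance assumption. As a cheap robustness sketch (assuming df_restos and data_attributes from the cells above; welch_check is an illustrative helper), Welch's correction can be applied by passing equal_var=False:

def welch_check(attribute):
    # Rebuild the two star-rating groups for one attribute and rerun the t-test without the equal-variance assumption
    mask_true = df_restos[attribute].isin(["True", True])
    mask_false = df_restos[attribute].isin(["False", False])
    t, p = spst.ttest_ind(df_restos.loc[mask_true, "stars"],
                          df_restos.loc[mask_false, "stars"],
                          equal_var=False)
    print(f"{attribute}: Welch t = {t:.3f}, p = {p:.3g}")

for attribute in data_attributes:
    welch_check(attribute)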
from scipy.stats import f_oneway

grouped_by_state = [group['stars'].values for name, group in df_restos.groupby('state')]
f_statistic_state, p_value_state = f_oneway(*grouped_by_state)

print(f"State-wide Analysis: F-statistic = {f_statistic_state}, p-value = {p_value_state}")
if p_value_state < 0.05:
    print("Reject the null hypothesis: There is a significant difference in average reviews across states.")
else:
    print("Fail to reject the null hypothesis: No significant difference in average reviews across states.")
State-wide Analysis: F-statistic = 76.2248728686078, p-value = 1.4925442502613572e-231
Reject the null hypothesis: There is a significant difference in average reviews across states.
<h1>ANOVA Test</h1>
Null Hypothesis (H0): The mean restaurant ratings are the same across different States.
Alternative Hypothesis (H1): The mean restaurant ratings differ across States.
df_state = con.sql("""
SELECT state, stars FROM restos
ORDER BY stars DESC
""").df()
df_state
state | stars | |
---|---|---|
0 | TX | 5.0 |
1 | OR | 5.0 |
2 | FL | 5.0 |
3 | OR | 5.0 |
4 | OH | 5.0 |
... | ... | ... |
48795 | FL | 1.0 |
48796 | OR | 1.0 |
48797 | OR | 1.0 |
48798 | GA | 1.0 |
48799 | OH | 1.0 |
48800 rows × 2 columns
from statsmodels.formula.api import ols

model = ols('stars ~ C(state)', data=df_state).fit()

# Perform ANOVA
anova_table = sm.stats.anova_lm(model, typ=2)

# Print the results
print(anova_table)
sum_sq df F PR(>F)
C(state) 669.225082 15.0 76.224873 1.492544e-231
Residual 28553.651142 48784.0 NaN NaN
Here we can see that the p-value is much smaller than alpha (0.05), hence we reject the null hypothesis and conclude that the mean restaurant ratings differ across states.
Null Hypothesis (H0): The mean restaurant ratings are the same across different Cities.
Alternative Hypothesis (H1): The mean restaurant ratings differ across Cities.
df_city = con.sql("""
SELECT city, stars FROM restos
""").df()
df_city
city | stars | |
---|---|---|
0 | Boulder | 4.0 |
1 | Portland | 4.0 |
2 | Vancouver | 3.5 |
3 | Columbus | 4.5 |
4 | Peabody | 4.0 |
... | ... | ... |
48795 | Tigard | 2.0 |
48796 | Bee Cave | 3.0 |
48797 | Atlanta | 3.0 |
48798 | Portland | 4.0 |
48799 | Altamonte Springs | 4.5 |
48800 rows × 2 columns
model = ols('stars ~ C(city)', data=df_city).fit()

# Perform ANOVA
anova_table = sm.stats.anova_lm(model, typ=2)

# Print the results
print(anova_table)
sum_sq df F PR(>F)
C(city) 1570.357546 451.0 6.087876 5.288212e-313
Residual 27652.518679 48348.0 NaN NaN
Here we can see that the p-value is much smaller than alpha (0.05), hence we reject the null hypothesis and conclude that the mean restaurant ratings differ across cities.
<h1> Confidence Interval</h1>
attribute_list = []
lower_interval_list = []
upper_interval_list = []

def get_proportionality(df, attribute):
    n = len(df)
    # using alpha as 0.05
    z = 1.96
    true_count = len(df[(df[attribute] == 'True') | (df[attribute] == True)])
    p_bar = true_count / n
    se = np.sqrt(p_bar * (1 - p_bar) / n)
    moe = z * se
    confidence_interval = (p_bar - moe, p_bar + moe)
    return confidence_interval
for attribute in significant_attributes:
    confidence_interval = get_proportionality(df_restos, attribute)

    attribute_list.append(attribute)
    lower_interval_list.append(confidence_interval[0])
    upper_interval_list.append(confidence_interval[1])

df_results = pd.DataFrame({'attributes': attribute_list, 'lower_interval': lower_interval_list, 'upper_interval': upper_interval_list})
df_results
attributes | lower_interval | upper_interval | |
---|---|---|---|
0 | attributes.RestaurantsTableService | 0.218280 | 0.225654 |
1 | attributes.BikeParking | 0.534245 | 0.543091 |
2 | attributes.BusinessAcceptsCreditCards | 0.783974 | 0.791231 |
3 | attributes.RestaurantsReservations | 0.288607 | 0.296680 |
4 | attributes.OutdoorSeating | 0.402568 | 0.411285 |
5 | attributes.RestaurantsTakeOut | 0.870948 | 0.876839 |
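The intervals above use the normal (Wald) approximation p_bar ± 1.96·sqrt(p_bar·(1 − p_bar)/n). As a cross-check sketch (assuming statsmodels is available, as imported at the top), statsmodels' proportion_confint reproduces the same intervals with method='normal':

from statsmodels.stats.proportion import proportion_confint

n = len(df_restos)
for attribute in significant_attributes:
    true_count = df_restos[attribute].isin(['True', True]).sum()
    low, high = proportion_confint(true_count, n, alpha=0.05, method='normal')
    print(f"{attribute}: ({low:.6f}, {high:.6f})")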
<h1> Regression</h1>
def regressionTest(df):
    columns_to_exclude = ['stars', 'review_count']
    binary_columns = [col for col in df.columns if col not in columns_to_exclude]
    mask = df[binary_columns].apply(lambda x: all(val in ['True', 'False'] for val in x), axis=1)
    filtered_df = df[mask]
    for col in binary_columns:
        filtered_df.loc[:, col] = filtered_df[col].replace({'True': 1, 'False': 0})
    for col in filtered_df.columns:
        if filtered_df[col].isin(['0', '1', 1, 0]).all():
            filtered_df[col] = filtered_df[col].astype(int)

    independent_columns = [col for col in df.columns if col != 'stars']
    X = filtered_df[independent_columns]  # Quantitative and binary predictors
    y = filtered_df['stars']  # Quantitative outcome

    print(filtered_df.dtypes)
    # Add a constant to the model (intercept)
    X = sm.add_constant(X)

    # Fit the multiple linear regression model
    model = sm.OLS(y, X).fit()

    # Print the summary of the regression
    print(model.summary())
df_temp = con.sql("""
SELECT stars, review_count, "attributes.BusinessAcceptsCreditCards" AS BusinessAcceptsCreditCards, "attributes.OutdoorSeating" AS OutdoorSeating, "attributes.BikeParking" AS BikeParking, "attributes.RestaurantsReservations" AS RestaurantsReservations, "attributes.RestaurantsTakeOut" AS RestaurantsTakeOut
FROM restos
ORDER BY stars ASC
""").df()
regressionTest(df_temp)
stars float64
review_count int64
BusinessAcceptsCreditCards int32
OutdoorSeating int32
BikeParking int32
RestaurantsReservations int32
RestaurantsTakeOut int32
dtype: object
OLS Regression Results
==============================================================================
Dep. Variable: stars R-squared: 0.075
Model: OLS Adj. R-squared: 0.075
Method: Least Squares F-statistic: 338.4
Date: Sat, 31 Aug 2024 Prob (F-statistic): 0.00
Time: 20:20:39 Log-Likelihood: -26763.
No. Observations: 25046 AIC: 5.354e+04
Df Residuals: 25039 BIC: 5.360e+04
Df Model: 6
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
const 3.6810 0.037 100.705 0.000 3.609 3.753
review_count 0.0004 2.01e-05 18.138 0.000 0.000 0.000
BusinessAcceptsCreditCards -0.4209 0.031 -13.688 0.000 -0.481 -0.361
OutdoorSeating 0.1991 0.009 21.609 0.000 0.181 0.217
BikeParking 0.2205 0.011 20.221 0.000 0.199 0.242
RestaurantsReservations 0.1030 0.010 10.463 0.000 0.084 0.122
RestaurantsTakeOut -0.0953 0.022 -4.374 0.000 -0.138 -0.053
==============================================================================
Omnibus: 710.077 Durbin-Watson: 0.149
Prob(Omnibus): 0.000 Jarque-Bera (JB): 771.235
Skew: -0.428 Prob(JB): 3.38e-168
Kurtosis: 3.084 Cond. No. 2.77e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
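Note [2] in the summary flags a large condition number. As a sketch of how to follow up on that warning (assuming a predictor matrix X with the constant added, built as inside regressionTest above; print_vif is an illustrative helper), statsmodels' variance_inflation_factor quantifies how much each predictor's variance is inflated by collinearity, with values above roughly 5-10 commonly treated as problematic:

from statsmodels.stats.outliers_influence import variance_inflation_factor

def print_vif(X):
    # X is the design matrix including the constant column, as passed to sm.OLS
    for i, col in enumerate(X.columns):
        print(f"{col}: VIF = {variance_inflation_factor(X.values, i):.2f}")

# e.g. call print_vif(X) inside regressionTest, right after X = sm.add_constant(X)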