import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
sns.set_style("darkgrid")
# Load the event log from CSV.
df = pd.read_csv("data/events.csv")
# Replace the comma-separated artist string with a list of artist names.
df = df.assign(artists=df["artists"].str.split(","))
# Notebook display: show the main columns only.
df.loc[:, ["date", "name", "artists", "venue", "city", "price", "rating", "notes"]]
| | date | name | artists | venue | city | price | rating | notes |
---|---|---|---|---|---|---|---|---|
0 | 2021-10-12 | ggbbxx tour | [keshi, LANY] | Agannis Arena | Boston | 103.00 | 7.0 | first time seeing keshi; lany was kinda boring... |
1 | 2022-04-17 | Sentiment Tour | [Said the Sky, Kaivon, Midnight Kids] | House of Blues | Boston | 39.25 | 6.0 | first rave; wish i knew as many songs as i do now |
2 | 2022-05-26 | HELL/HEAVEN TOUR | [keshi, rei brown] | Royale | Boston | 130.00 | 10.0 | meet & greet; got his set list and a piece of ... |
3 | 2022-06-25 | NaN | [Illenium, TroyBoi] | Forest Hills Stadium | New York | 216.77 | 10.0 | best rave ive been to (as of 240514) |
4 | 2022-08-20 - 2022-08-21 | Head in the Clouds Festival | [keshi, Chungha, Jay Park, pH-1, Sik-K, Joji, ... | Rose Bowl Stadium | Pasadena | 410.55 | 10.0 | so many asians - i felt at home |
5 | 2022-09-06 | Reach Inside Tour | [Hojean] | Brighton Music Hall | Boston | 20.50 | 4.0 | too chill/low energy for my liking but maybe b... |
6 | 2022-10-02 | The Nicole Tour | [Niki, sundial] | Royale | Boston | 119.51 | 8.0 | meet & greet |
7 | 2022-10-13 | Smithereens Tour | [Joji, dhruv] | Roadrunner | Boston | 157.69 | 8.0 | everyone was singing to slow dancing and glimp... |
8 | 2022-11-14 | Born Pink World Tour | [Blackpink] | Prudential Center | Newark | 279.78 | 7.0 | lowkey underwhelming but at least i got to see... |
9 | 2023-03-09 | Sanctuary Tour | [Dabin, Grabbitz, Apashe] | Roadrunner | Boston | 42.85 | 7.0 | alive x starbright went so hard |
10 | 2023-03-14 | Hell & Back Tour | [keshi, Weston Estate, James Ivy] | Roadrunner | Boston | 215.42 | 10.0 | greatest keshi performance; wish i got meet & ... |
11 | 2023-03-16 | Girls Like Me Don't Cry Tour | [thuy, P-Lo] | Royale | Boston | 24.42 | 5.0 | nothing too special tbh |
12 | 2023-06-06 | NaN | [The Chainsmokers] | Memoire | Everett | 116.00 | 3.0 | waited for like 3 hours; bad crowd |
13 | 2023-07-06 | Ready to Be World Tour | [Twice] | Metlife Stadium | East Rutherford | 272.00 | 10.0 | felt so euphoric; wish they played TT |
14 | 2023-09-02 | Electric Zoo | [Gryffin, Alison Wonderland, Zedd] | Randall's Island Park | New York | 92.85 | 4.0 | pretty much a scam but at least i only had a d... |
15 | 2023-09-02 | Breakaway Boston | [Knock2] | Stage at Suffolk Downs | Boston | 121.06 | 8.0 | so drunk; think we were a little late to his set |
16 | 2023-09-29 | NaN | [Zedd] | Big E Arena | West Springfield | 78.00 | 6.0 | raining really hard; bad crowd |
17 | 2024-01-13 | Infinite Skies | [Armnhmr, Levity] | Big Night Live | Boston | 29.05 | 8.5 | 22nd birthday; knew all their songs |
18 | 2024-03-17 | Nature of Hope Tour | [William Black] | House of Blues | Boston | 40.95 | 9.0 | first time hearing back to u live; pretty much... |
19 | 2024-04-06 | NaN | [Slander] | Big Night Live | Boston | 85.30 | 7.0 | too much trap but i heard back to u again |
20 | 2024-04-26 | NaN | [Knock2] | Big Night Live | Boston | 78.47 | 10.0 | second best rave ive ever been to |
21 | 2024-07-15 | HER World Tour | [IU] | Prudential Center | Newark | 158.12 | NaN | NaN |
22 | 2024-08-02 | NaN | [Seven Lions] | Big Night Live | Boston | NaN | NaN | NaN |
23 | 2024-08-30 - 2024-09-01 | North Coast Music Festival | [Illenium, Knock2, Isoxo, Seven Lions, Slander... | Seatgeek Stadium | Bridgeview | 349.92 | NaN | NaN |
24 | 2024-09-06 - 2024-09-07 | Breakaway Mass | [Illenium, Armnhmr, Tiesto, Ray Volpe, Deadmau... | The Palladium Outdoors | Worcester | 138.74 | NaN | NaN |
25 | 2024-09-20 | Zedd | [Zedd] | MGM Music Hall at Fenway | Boston | NaN | NaN | NaN |
26 | 2024-10-26 | NaN | [Gryffin] | Forest Hills Stadium | New York | 100.51 | NaN | NaN |
# Paid events: rows whose price is strictly positive (missing prices are excluded).
paid_df = df.loc[df["price"].gt(0)]
# Attended events: rows that have a rating.
attended_df = df.dropna(subset=["rating"])
# Split by event type: concerts ...
concerts_df = df.loc[df["event_type"].eq("concert")]
# ... and festivals.
festivals_df = df.loc[df["event_type"].eq("festival")]
# add year to df
# The year is the first four characters of the date string; for multi-day
# ranges such as "2022-08-20 - 2022-08-21" that is the start year.
# `.str[:4]` leaves a missing date as NaN instead of raising.
df["year"] = df["date"].str[:4]

# add coords to df
venue_df = pd.read_csv("data/venues.csv")
# Parse each comma-separated coords string into a list of floats
# (presumably [latitude, longitude], since it is later used as a folium location).
venue_df["coords"] = venue_df["coords"].str.split(",")
venue_df["coords"] = venue_df["coords"].apply(
    lambda parts: [float(part) for part in parts]
)

# Left-join the venue metadata onto the events. Dropping repeated venue rows
# first guarantees exactly one output row per event, in the original event
# order, so the new columns can be copied back by position rather than by
# index alignment (which would misassign values if the row counts or the
# index of `df` ever differed).
merged_df = df.merge(
    venue_df.drop_duplicates(subset="venue"), on="venue", how="left"
)
df["metro"] = merged_df["metro"].to_numpy()
df["coords"] = merged_df["coords"].to_numpy()
# Rank attended events from highest to lowest rating.
by_ratings = attended_df.sort_values("rating", ascending=False)
# Notebook display: rating first, followed by the identifying columns.
by_ratings.loc[:, ["rating", "date", "name", "artists", "price"]]
| | rating | date | name | artists | price |
---|---|---|---|---|---|
10 | 10.0 | 2023-03-14 | Hell & Back Tour | [keshi, Weston Estate, James Ivy] | 215.42 |
4 | 10.0 | 2022-08-20 - 2022-08-21 | Head in the Clouds Festival | [keshi, Chungha, Jay Park, pH-1, Sik-K, Joji, ... | 410.55 |
13 | 10.0 | 2023-07-06 | Ready to Be World Tour | [Twice] | 272.00 |
20 | 10.0 | 2024-04-26 | NaN | [Knock2] | 78.47 |
3 | 10.0 | 2022-06-25 | NaN | [Illenium, TroyBoi] | 216.77 |
2 | 10.0 | 2022-05-26 | HELL/HEAVEN TOUR | [keshi, rei brown] | 130.00 |
18 | 9.0 | 2024-03-17 | Nature of Hope Tour | [William Black] | 40.95 |
17 | 8.5 | 2024-01-13 | Infinite Skies | [Armnhmr, Levity] | 29.05 |
6 | 8.0 | 2022-10-02 | The Nicole Tour | [Niki, sundial] | 119.51 |
7 | 8.0 | 2022-10-13 | Smithereens Tour | [Joji, dhruv] | 157.69 |
15 | 8.0 | 2023-09-02 | Breakaway Boston | [Knock2] | 121.06 |
19 | 7.0 | 2024-04-06 | NaN | [Slander] | 85.30 |
0 | 7.0 | 2021-10-12 | ggbbxx tour | [keshi, LANY] | 103.00 |
9 | 7.0 | 2023-03-09 | Sanctuary Tour | [Dabin, Grabbitz, Apashe] | 42.85 |
8 | 7.0 | 2022-11-14 | Born Pink World Tour | [Blackpink] | 279.78 |
16 | 6.0 | 2023-09-29 | NaN | [Zedd] | 78.00 |
1 | 6.0 | 2022-04-17 | Sentiment Tour | [Said the Sky, Kaivon, Midnight Kids] | 39.25 |
11 | 5.0 | 2023-03-16 | Girls Like Me Don't Cry Tour | [thuy, P-Lo] | 24.42 |
5 | 4.0 | 2022-09-06 | Reach Inside Tour | [Hojean] | 20.50 |
14 | 4.0 | 2023-09-02 | Electric Zoo | [Gryffin, Alison Wonderland, Zedd] | 92.85 |
12 | 3.0 | 2023-06-06 | NaN | [The Chainsmokers] | 116.00 |
# calculate summary
def print_summary(title, df, rating=False):
    """Print the event count, total price and average price of a frame.

    Args:
        title: Heading printed on the first line, followed by a colon.
        df: DataFrame with a "price" column, plus a "rating" column when
            ``rating`` is true.
        rating: When true, also print the mean of the "rating" column.

    The average price is the summed price divided by the number of rows, so
    rows with a missing price still count toward the denominator. When the
    frame is empty (or no rating is available) the corresponding average is
    printed as "N/A" instead of dividing by zero or printing "nan".
    """
    events = len(df)
    price = df["price"].sum()

    print(f"{title}:")
    print(f"Count: {events}")
    print(f"Price: ${price:.2f}")

    # An empty frame has no meaningful average; avoid dividing by zero.
    if events:
        print(f"Avg price: ${price / events:.2f}")
    else:
        print("Avg price: N/A")

    if rating:
        avg_rating = df["rating"].mean()
        # mean() yields NaN when there are no ratings to average.
        if pd.isna(avg_rating):
            print("Avg rating: N/A")
        else:
            print(f"Avg rating: {avg_rating:.2f}")

    # Trailing blank lines separate consecutive summaries.
    print("\n")
# Print one summary per subset; the average rating is shown only for the
# attended events (the subset built from rows that have a rating).
for title, frame, with_rating in (
    ("Total", df, False),
    ("Attended", attended_df, True),
    ("Paid", paid_df, False),
):
    print_summary(title, frame, rating=with_rating)
Total: Count: 27 Price: $3420.71 Avg price: $126.69 Attended: Count: 21 Price: $2673.42 Avg price: $127.31 Avg rating: 7.50 Paid: Count: 25 Price: $3420.71 Avg price: $136.83
# Number of events per year, ordered by year rather than by frequency.
year_count = df["year"].value_counts().sort_index()
ax = sns.barplot(x=year_count.index, y=year_count.values, palette="flare")
ax.set(title="Events per Year", xlabel="Year", ylabel="Count")
plt.show()
# create map
# Initial view is centred on the given latitude/longitude at zoom level 4
# (NOTE(review): the point looks like the centre of the contiguous US — confirm).
# NOTE: `map` shadows the Python builtin; the name is kept because other cells
# may refer to it.
map = folium.Map(location=[39.82835, -98.5820546], zoom_start=4)
for _, row in df.iterrows():
    # info from df
    date = row["date"]
    venue = row["venue"]
    city = row["city"]
    coords = row["coords"]

    # A venue with no match in the venue table has NaN coords; folium cannot
    # place a marker without a location, so skip such rows instead of crashing.
    if not isinstance(coords, (list, tuple)):
        continue

    # A missing artists cell is not a list. Checking the type is more reliable
    # than an identity comparison against np.nan, which only matches that one
    # specific object and would let other missing-value markers reach join().
    artists = "N/A"
    if isinstance(row["artists"], list):
        artists = ", ".join(row["artists"])

    # add marker
    folium.Marker(
        location=coords,
        popup=f"<b>Date:</b> {date}<br><b>Artists:</b> {artists}<br><b>Venue:</b> {venue}<br><b>City:</b> {city}",
        icon=folium.Icon(color="red", prefix="fa", icon="fa-music"),
    ).add_to(map)
# Notebook display: render the finished map.
map
Only venues visited at least 2 times are shown.
# Tally events per venue and keep the venues that appear at least twice.
venue_count = df["venue"].value_counts()
top_venues = venue_count.loc[venue_count.ge(2)]
# One colour per bar, taken from the reversed "flare" palette.
palette = sns.color_palette("flare_r", n_colors=len(top_venues))
ax = sns.barplot(x=top_venues.index, y=top_venues.values, palette=palette)
ax.set(title="Top Venues", xlabel="Venue", ylabel="Count")
# Integer ticks from 0 up to the highest count.
plt.yticks(np.arange(0, top_venues.values.max() + 1, 1))
plt.xticks(rotation=-45)
plt.show()
# Count events per metro area (value_counts returns them most frequent first).
metro_count = df["metro"].value_counts()
palette = sns.color_palette("flare_r", n_colors=len(metro_count))
ax = sns.barplot(x=metro_count.index, y=metro_count.values, palette=palette)
ax.set(title="Top Metro Areas", xlabel="Metro Area", ylabel="Count")
# A tick every 2 events, reaching at least the highest count.
plt.yticks(range(0, int(metro_count.max()) + 2, 2))
plt.show()
# Count events per city (value_counts returns them most frequent first).
city_count = df["city"].value_counts()
palette = sns.color_palette("flare_r", n_colors=len(city_count))
ax = sns.barplot(x=city_count.index, y=city_count.values, palette=palette)
ax.set(title="Top Cities", xlabel="City", ylabel="Count")
# Slant the city names so they do not overlap.
plt.xticks(rotation=-45)
plt.show()
# Artists seen at least twice, among attended events.
# explode() gives every artist in an event's list its own row.
artists_df = attended_df.explode("artists")
artist_counts = artists_df["artists"].value_counts()
top_artists = artist_counts.loc[artist_counts.ge(2)]
# Keep only the exploded rows that belong to those artists.
top_artists_df = artists_df.loc[artists_df["artists"].isin(top_artists.index)]
Only artists seen at least 2 times are shown.
# Bar chart of how many attended events each top artist appeared in.
palette = sns.color_palette("flare_r", n_colors=len(top_artists))
ax = sns.barplot(x=top_artists.index, y=top_artists.values, palette=palette)
ax.set(title="Top Artists (count)", xlabel="Artist", ylabel="Count")
# Integer ticks from 0 up to the highest count.
plt.yticks(np.arange(0, top_artists.values.max() + 1, 1))
plt.xticks(rotation=-45)
plt.show()
Only artists seen at least 2 times are included.
# Mean rating per top artist, best-rated first.
avg_rating = (
    top_artists_df.groupby("artists")["rating"]
    .mean()
    .reset_index()
    .sort_values(by="rating", ascending=False)
)
palette = sns.color_palette("flare_r", n_colors=len(avg_rating))
ax = sns.barplot(data=avg_rating, x="artists", y="rating", palette=palette)
ax.set(title="Top Artists (rating)", xlabel="Artist", ylabel="Average Rating")
# Ticks at every whole rating from 0 to 10.
plt.yticks(np.arange(0, 11))
plt.xticks(rotation=-45)
plt.show()
# Histogram of paid-event prices in $50-wide bins covering $0 to $500.
bins = list(range(0, 501, 50))
sns.displot(data=paid_df, x="price", bins=bins, color="#3CB371")
plt.title("Price Distribution")
plt.xlabel("Price ($)")
plt.ylabel("Count")
# Leave a little room to the right of the last bin.
plt.xlim(0, 530)
plt.show()
# Scatter of price against rating for attended events; point colour also
# encodes price, so the legend is switched off.
ax = sns.scatterplot(
    data=attended_df,
    x="price",
    y="rating",
    hue="price",
    palette="flare",
    legend=False,
)
ax.set(title="Price vs Rating", xlabel="Price ($)", ylabel="Rating")
ax.grid(True)
plt.show()
# Total amount spent in each year.
prices = df.groupby("year")["price"].sum()
ax = sns.barplot(x=prices.index, y=prices.values, palette="flare")
# Write the dollar total just above each bar.
for position, total in enumerate(prices.values):
    ax.text(position, total + 0.5, f"${total:.2f}", ha="center")
ax.set(title="Price per Year", xlabel="Year", ylabel="Price ($)")
plt.show()
Ratings ending in .5 are rounded up to the next whole number.
# Work on a copy so the rounded ratings do not leak into attended_df.
rating_df = attended_df.copy()
# Round half up to a whole rating: floor(x + 0.5) sends 8.5 to 9 but leaves
# 8.1 at 8. np.ceil would push every fraction up (8.1 -> 9), and np.round
# rounds halves to the nearest even value (8.5 -> 8).
rating_df["rating"] = np.floor(rating_df["rating"] + 0.5).astype(int)
# Number of distinct rounded ratings (not used by the plot below; kept in case
# another cell reads it).
counts = rating_df["rating"].nunique()
# Show every rating from 0 to 10, including values with no events.
rating_order = np.arange(0, 11)
palette = sns.color_palette('flare', n_colors=11)
sns.countplot(
    data=rating_df,
    x="rating",
    order=rating_order,
    palette=palette
)
plt.title("Rating Distribution")
plt.xlabel("Rating")
plt.ylabel("Count")
plt.show()
# Count events per genre (value_counts returns them most frequent first).
genre_count = df["genre"].value_counts()
palette = sns.color_palette("flare_r", n_colors=len(genre_count))
ax = sns.barplot(x=genre_count.index, y=genre_count.values, palette=palette)
ax.set(title="Top Genres", xlabel="Genre", ylabel="Count")
plt.show()
# Genres ordered by their mean rating, best first; used as the bar order.
sorted_genres = (
    attended_df.groupby("genre")["rating"]
    .mean()
    .sort_values(ascending=False)
    .index
)
# barplot aggregates each genre's ratings to a bar; error bars are switched off.
ax = sns.barplot(
    data=attended_df,
    x="genre",
    y="rating",
    palette="flare_r",
    order=sorted_genres,
    errorbar=None,
)
ax.set(title="Average Rating by Genre", xlabel="Genre", ylabel="Rating")
# Ticks at every whole rating from 0 to 10.
plt.yticks(np.arange(0, 11))
plt.show()
# Genres ordered by their mean price, most expensive first; used as the bar order.
sorted_genres = attended_df.groupby("genre")["price"].mean().sort_values(ascending=False).index
# barplot aggregates each genre's prices to a bar; error bars are switched off.
plot = sns.barplot(
    data=attended_df, x="genre", y="price", palette="flare_r", order=sorted_genres, errorbar=None
)
# Label every bar with its height, formatted as dollars and centred on the bar.
for bar in plot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    plot.text(x=bar_center, y=bar_height, s=f"${bar_height:.2f}", ha="center")
plot.set(title="Average Price by Genre", xlabel="Genre", ylabel="Price ($)")
# Fixed ticks every $50 from $0 to $300.
plt.yticks(np.arange(0, 325, 50))
plt.show()