Bookshop Exploratory Data Analysis using SQL
Objective
The objective of this notebook is to perform exploratory data analysis on a fictional Bookshop dataset using SQL queries run through pandasql. The data is loaded, explored, cleaned, and analysed to draw insights.
Content
- Load the dataset
- Data Exploration
- Data Cleaning
- Data Analysis
In [1]:
#import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandasql import sqldf
In [2]:
#read in bookshop data
df = pd.read_csv(r'TotalFictionBookstore_Top_Reads_2022.csv')
In [3]:
#cast columns to their correct data types
int_cols = ['YearPublished','NumPages','UnitsSold','Revenue']
float_cols = ['Rating']
df[int_cols] = df[int_cols].astype(int)
df[float_cols] = df[float_cols].astype(float)
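The `astype(int)` calls above assume every value parses cleanly; if the CSV ever contained blanks or stray text they would raise a `ValueError`. A more defensive sketch using `pd.to_numeric` (the tiny frame below is purely illustrative, not part of the dataset):

```python
import pandas as pd

# hypothetical raw frame with one unparseable entry, for illustration only
raw = pd.DataFrame({'UnitsSold': ['120', 'n/a', '75']})

# errors='coerce' turns bad values into NaN instead of raising,
# so they can be inspected (or dropped) before the final int cast
raw['UnitsSold'] = pd.to_numeric(raw['UnitsSold'], errors='coerce')
print(raw['UnitsSold'].isna().sum())  # → 1
```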
In [4]:
#Describe dataset
print(df.columns)
print(df.info())
print(df.describe())
print(df.shape)
Index(['Title', 'YearPublished', 'NumPages', 'Genres', 'Rating', 'Author',
'UnitsSold', 'Revenue'],
dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Title 250 non-null object
1 YearPublished 250 non-null int32
2 NumPages 250 non-null int32
3 Genres 250 non-null object
4 Rating 250 non-null float64
5 Author 250 non-null object
6 UnitsSold 250 non-null int32
7 Revenue 250 non-null int32
dtypes: float64(1), int32(4), object(3)
memory usage: 11.8+ KB
None
YearPublished NumPages Rating UnitsSold Revenue
count 250.000000 250.000000 250.000000 250.000000 250.000000
mean 1958.816000 389.088000 8.249600 281.248000 6769.884000
std 34.793339 97.442271 0.294699 234.657407 6508.702844
min 1900.000000 204.000000 6.600000 18.000000 372.000000
25% 1928.250000 318.000000 8.100000 87.500000 1975.250000
50% 1958.000000 375.000000 8.200000 211.500000 5016.000000
75% 1991.000000 438.000000 8.400000 408.750000 8869.000000
max 2019.000000 963.000000 9.300000 1212.000000 32941.000000
(250, 8)
In [5]:
df.head()
Out[5]:
| | Title | YearPublished | NumPages | Genres | Rating | Author | UnitsSold | Revenue |
|---|---|---|---|---|---|---|---|---|
| 0 | Whispers in the Mist | 1906 | 426 | Mystery | 9.3 | Olivia Hudson | 1212 | 30978 |
| 1 | Starlight Chronicles | 2017 | 525 | Fantasy | 9.2 | Ethan Lawson | 844 | 26586 |
| 2 | Beneath the Surface | 1904 | 456 | Thriller | 9.0 | Ava Mitchell | 1204 | 32941 |
| 3 | Pillars of Creation | 1927 | 606 | Science Fiction | 9.0 | Ethan Lawson | 573 | 20834 |
| 4 | The Farthest Harbour | 1992 | 462 | Adventure | 8.9 | Liam Parker | 930 | 25779 |
In [6]:
#decade analysis
decade_input = """
SELECT TITLE, YEARPUBLISHED,
(YEARPUBLISHED/10) * 10 AS DECADE,
NUMPAGES, GENRES, RATING, AUTHOR, UNITSSOLD,
REVENUE
FROM df
"""
books_output = sqldf(decade_input,locals())
bookshop = pd.DataFrame(books_output)
bookshop
Out[6]:
| | Title | YearPublished | DECADE | NumPages | Genres | Rating | Author | UnitsSold | Revenue |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Whispers in the Mist | 1906 | 1900 | 426 | Mystery | 9.3 | Olivia Hudson | 1212 | 30978 |
| 1 | Starlight Chronicles | 2017 | 2010 | 525 | Fantasy | 9.2 | Ethan Lawson | 844 | 26586 |
| 2 | Beneath the Surface | 1904 | 1900 | 456 | Thriller | 9.0 | Ava Mitchell | 1204 | 32941 |
| 3 | Pillars of Creation | 1927 | 1920 | 606 | Science Fiction | 9.0 | Ethan Lawson | 573 | 20834 |
| 4 | The Farthest Harbour | 1992 | 1990 | 462 | Adventure | 8.9 | Liam Parker | 930 | 25779 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 245 | Eclipse of Illusions | 1950 | 1950 | 351 | Sci-FiRomance | 8.0 | Sofia Nelson | 76 | 1600 |
| 246 | Murmurs of the Forgotten | 1932 | 1930 | 300 | Historical Mystery | 8.0 | Sebastian Callahan | 71 | 1278 |
| 247 | Sands of Solace | 1991 | 1990 | 255 | Adventure Romance | 8.1 | Maya Middleton | 54 | 826 |
| 248 | Veiled Echoes | 1919 | 1910 | 387 | Psychological Thriller | 8.1 | Kieran Caldwell | 42 | 975 |
| 249 | Eternal Prophecies | 1921 | 1920 | 372 | Urban Fantasy | 8.1 | Sophia Reynolds | 276 | 6160 |
250 rows × 9 columns
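The DECADE expression relies on SQLite's integer division truncating toward zero; the same derivation in plain pandas (a cross-check sketch on a few years taken from the rows above) is:

```python
import pandas as pd

years = pd.DataFrame({'YearPublished': [1906, 2017, 1904, 1992]})

# floor-divide by 10, then multiply back: 1906 -> 190 -> 1900,
# matching (YEARPUBLISHED/10)*10 under SQLite integer division
years['Decade'] = years['YearPublished'] // 10 * 10
print(years['Decade'].tolist())  # → [1900, 2010, 1900, 1990]
```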
In [7]:
#check book count and revenue by decade
gross_by_decade = """
SELECT DECADE,
COUNT(TITLE),
SUM(REVENUE) AS TOTAL_REVENUE,
ROUND(AVG(REVENUE),0) AS AVG_REVENUE
FROM bookshop
GROUP BY DECADE
ORDER BY TOTAL_REVENUE DESC
"""
agg_decade = sqldf(gross_by_decade,locals())
agg_decade
Out[7]:
| | DECADE | COUNT(TITLE) | TOTAL_REVENUE | AVG_REVENUE |
|---|---|---|---|---|
| 0 | 1960 | 24 | 192945 | 8039.0 |
| 1 | 1990 | 22 | 183387 | 8336.0 |
| 2 | 2010 | 27 | 172651 | 6394.0 |
| 3 | 1950 | 24 | 169071 | 7045.0 |
| 4 | 1900 | 18 | 163058 | 9059.0 |
| 5 | 1920 | 26 | 158715 | 6104.0 |
| 6 | 1910 | 22 | 148131 | 6733.0 |
| 7 | 1930 | 21 | 146515 | 6977.0 |
| 8 | 1940 | 20 | 104058 | 5203.0 |
| 9 | 1970 | 18 | 92875 | 5160.0 |
| 10 | 2000 | 18 | 90404 | 5022.0 |
| 11 | 1980 | 10 | 70661 | 7066.0 |
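The same per-decade aggregation can be cross-checked in pandas (a sketch on a few illustrative rows; column names follow the query above):

```python
import pandas as pd

# three illustrative rows, not the full dataset
books = pd.DataFrame({
    'DECADE':  [1900, 1900, 2010],
    'REVENUE': [30978, 32941, 26586],
})

# named aggregations mirror SUM(REVENUE) and ROUND(AVG(REVENUE), 0)
agg = (books.groupby('DECADE')['REVENUE']
            .agg(TOTAL_REVENUE='sum', AVG_REVENUE='mean')
            .round({'AVG_REVENUE': 0})
            .sort_values('TOTAL_REVENUE', ascending=False)
            .reset_index())
print(agg)
```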
In [8]:
# sqldf already returns a DataFrame; cast DECADE to string so it plots as a category
agg_decade['DECADE'] = agg_decade['DECADE'].astype(str)
In [9]:
# Creating primary y-axis (total revenue)
fig, ax1 = plt.subplots(figsize=(15, 9))
ax1.bar(agg_decade['DECADE'], agg_decade['TOTAL_REVENUE'],
width = 0.4, align='center', label='Total Revenue (Dollars)')
ax1.set_xlabel('Decade')
ax1.set_ylabel('Total Revenue (Dollars)', color='blue')
# Adjusting gridlines on the primary y-axis
ax1.grid(color='blue', linestyle='--', linewidth=0.5, alpha=0.5)
# Creating secondary y-axis (avg revenue)
ax2 = ax1.twinx()
ax2.scatter(agg_decade['DECADE'], agg_decade['AVG_REVENUE'],
marker='o', color='red', label='Avg Revenue (Dollars)')
ax2.set_ylabel('Avg Revenue (Dollars)', color='red')
# Adjusting gridlines on the secondary y-axis
ax2.grid(color='red', linestyle='--', linewidth=0.5, alpha=0.5)
# Scaling each y-axis to 110% of its own maximum
ax1.set_ylim(0, 1.1*max(agg_decade['TOTAL_REVENUE']))
ax2.set_ylim(0, 1.1*max(agg_decade['AVG_REVENUE']))
# Combining legends for both axes
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper left')
# Set title
plt.title('Total and Avg Revenue by Decade')
# Show the plot
plt.show()
In [10]:
# top 10 authors in bookshop
author_books = """
SELECT AUTHOR,
COUNT(TITLE) AS NUM_BOOKS,
SUM(REVENUE) AS TOTAL_REVENUE,
ROUND(AVG(REVENUE),0) AS AVG_REVENUE,
ROUND(AVG(RATING),2) AS AVG_RATING_PER_BOOK
FROM bookshop
GROUP BY AUTHOR
ORDER BY TOTAL_REVENUE DESC
LIMIT 10
"""
author_books = sqldf(author_books,locals())
author_books
Out[10]:
| | Author | NUM_BOOKS | TOTAL_REVENUE | AVG_REVENUE | AVG_RATING_PER_BOOK |
|---|---|---|---|---|---|
| 0 | Ava Mitchell | 7 | 152386 | 21769.0 | 8.57 |
| 1 | Emma Hayes | 7 | 85135 | 12162.0 | 8.30 |
| 2 | Liam Parker | 5 | 83160 | 16632.0 | 8.44 |
| 3 | Sophia Reynolds | 7 | 82299 | 11757.0 | 8.27 |
| 4 | Noah Foster | 3 | 81170 | 27057.0 | 8.87 |
| 5 | Ethan Lawson | 3 | 55331 | 18444.0 | 8.87 |
| 6 | Lucas Bennett | 3 | 53248 | 17749.0 | 8.50 |
| 7 | Olivia Hudson | 2 | 51015 | 25508.0 | 8.95 |
| 8 | Lily Cooper | 7 | 44008 | 6287.0 | 8.31 |
| 9 | Mia Campbell | 2 | 35524 | 17762.0 | 8.65 |
In [11]:
result = bookshop.groupby('Author') \
.agg({
'Title': 'count',
'Revenue': ['sum', 'mean'],
'Rating': 'mean'
}) \
.reset_index()
result.columns = ['Author', 'Num_Books', 'Total_Revenue',
'Avg_Revenue', 'Avg_Rating_per_Book']
# Sorting the result by 'TOTAL_REVENUE' in descending order
result = result.sort_values('Total_Revenue', ascending=False)
# Selecting the top 10 rows
result_top10 = result.head(10)
result_top10
Out[11]:
| | Author | Num_Books | Total_Revenue | Avg_Revenue | Avg_Rating_per_Book |
|---|---|---|---|---|---|
| 20 | Ava Mitchell | 7 | 152386 | 21769.428571 | 8.571429 |
| 56 | Emma Hayes | 7 | 85135 | 12162.142857 | 8.300000 |
| 97 | Liam Parker | 5 | 83160 | 16632.000000 | 8.440000 |
| 151 | Sophia Reynolds | 7 | 82299 | 11757.000000 | 8.271429 |
| 124 | Noah Foster | 3 | 81170 | 27056.666667 | 8.866667 |
| 57 | Ethan Lawson | 3 | 55331 | 18443.666667 | 8.866667 |
| 105 | Lucas Bennett | 3 | 53248 | 17749.333333 | 8.500000 |
| 130 | Olivia Hudson | 2 | 51015 | 25507.500000 | 8.950000 |
| 100 | Lily Cooper | 7 | 44008 | 6286.857143 | 8.314286 |
| 117 | Mia Campbell | 2 | 35524 | 17762.000000 | 8.650000 |
In [12]:
agg_author = pd.DataFrame(author_books)
In [13]:
# Creating figure and axis
fig1, ax1 = plt.subplots(figsize=(15, 9))
#plotting bar chart of total revenue
ax1.bar(agg_author['Author'], agg_author['TOTAL_REVENUE'],
width=0.4, align='center', color='silver', label='Total Revenue (Dollars)')
ax1.set_xlabel('Author')
# fixing the tick positions first avoids the FixedFormatter warning
ax1.set_xticks(range(len(agg_author['Author'])))
ax1.set_xticklabels(agg_author['Author'], rotation=-45, ha='left')
ax1.set_ylabel('Total Revenue (Dollars)', color='blue')
# Adjusting gridlines on the primary y-axis
ax1.grid(color='blue', linestyle='--', linewidth=0.5, alpha=0.5)
#creating scatter plot of avg revenue
ax1.scatter(agg_author['Author'], agg_author['AVG_REVENUE'],
marker="D", color='blue', label='Avg Revenue per Book (Dollars)')
# Creating scatter plot of avg rating on secondary axis
ax2 = ax1.twinx()
ax2.scatter(agg_author['Author'], agg_author['AVG_RATING_PER_BOOK'],
marker='^', color='red', label='Avg Rating per Book')
ax2.set_ylabel('Avg Rating', color='red')
# Adjusting gridlines on the secondary y-axis
ax2.grid(color='red', linestyle='--', linewidth=0.5, alpha=0.5)
# Combining legends for both axes
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper right')
# Set title
plt.title('Top 10 Authors by Revenue, Rating')
# Show the plot
plt.show()
In [14]:
#bucket page length into quartiles (cut-offs 318/375/438 from df.describe())
pagelength_query = """
SELECT TITLE, REVENUE, NUMPAGES,
CASE
WHEN NUMPAGES < 318 THEN 'Quartile 1'
WHEN NUMPAGES <= 375 THEN 'Quartile 2'
WHEN NUMPAGES <= 438 THEN 'Quartile 3'
ELSE 'Quartile 4'
END AS PAGELENGTH_QUARTILE
FROM bookshop
ORDER BY REVENUE DESC
"""
pagelength_output = sqldf(pagelength_query,locals())
pagelength_output
Out[14]:
| | Title | Revenue | NumPages | PAGELENGTH_QUARTILE |
|---|---|---|---|---|
| 0 | Beneath the Surface | 32941 | 456 | Quartile 4 |
| 1 | Whispers in the Mist | 30978 | 426 | Quartile 3 |
| 2 | Midnight Secrets | 30029 | 603 | Quartile 4 |
| 3 | Hidden in Plain Sight | 28451 | 444 | Quartile 4 |
| 4 | Legacy of Stars | 27009 | 534 | Quartile 4 |
| ... | ... | ... | ... | ... |
| 245 | City of Secrets | 528 | 267 | Quartile 1 |
| 246 | Whispers of the Forgotten Past | 495 | 330 | Quartile 2 |
| 247 | Whispers of Desolation | 447 | 414 | Quartile 3 |
| 248 | Sands of Echoes | 435 | 330 | Quartile 2 |
| 249 | Sins of the Echo | 372 | 282 | Quartile 1 |
250 rows × 4 columns
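The CASE statement hard-codes the quartile cut-offs read off `df.describe()` earlier (318 / 375 / 438), so it would silently drift if the data changed. `pd.qcut` derives the breakpoints from the data itself; a minimal sketch on illustrative page counts:

```python
import pandas as pd

# illustrative page counts, not the real column
pages = pd.Series([204, 300, 318, 375, 400, 438, 500, 963])
labels = ['Quartile 1', 'Quartile 2', 'Quartile 3', 'Quartile 4']

# qcut computes the 25/50/75% breakpoints from the data itself,
# keeping the buckets balanced as the dataset grows
quartile = pd.qcut(pages, q=4, labels=labels)
print(quartile.value_counts().sort_index())
```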
In [15]:
pagelength_output = pd.DataFrame(pagelength_output)
In [16]:
# Set the style for the plots
sns.set(style="whitegrid")
#Setting order of the page-length quartile buckets
pagelength_bucket_order = ['Quartile 1', 'Quartile 2',
'Quartile 3', 'Quartile 4']
# Create the boxplot
plt.figure(figsize=(16, 10))
sns.boxplot(x='PAGELENGTH_QUARTILE', y='Revenue',
data=pagelength_output, order = pagelength_bucket_order,
showfliers=True)
# Add labels and title
plt.xlabel('PageLength Quartile')
plt.ylabel('Revenue (Dollars)')
plt.title('Boxplot of Revenue by PageLength Quartile')
# Show the plot
plt.show()