import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets
import bqplot
import matplotlib.colors as mpl_colors
import seaborn as sns
import datetime


data_2021 = pd.read_csv("https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv", 
                   low_memory=False)


np.random.seed(2022)
nsamples = len(data_2021) // 100
downSampleMask = np.random.choice(range(len(data_2021)-1), 
                                  nsamples, replace=False)
data_2021 = data_2021.loc[downSampleMask]


data_2021.head()


len(data_2021)

13697


len(data_2021.columns)

18


columnsCared = {'tpep_pickup_datetime', 'passenger_count',
               'trip_distance','total_amount'}


for c in columnsCared:
    print(c, data_2021[c].dtype)

tpep_pickup_datetime object
trip_distance float64
total_amount float64
passenger_count float64


data_2021['pickup_date'] = pd.to_datetime(data_2021['tpep_pickup_datetime']).dt.date
data_2021['pickup_date'] = pd.to_datetime(data_2021['pickup_date'])
data_2021['pickup_date'].head()

411971   2021-01-12
406278   2021-01-11
989763   2021-01-25
580385   2021-01-15
542365   2021-01-14
Name: pickup_date, dtype: datetime64[ns]


pd.to_datetime(data_2021['pickup_date']).dt.date.unique()

array([datetime.date(2021, 1, 12), datetime.date(2021, 1, 11),
       datetime.date(2021, 1, 25), datetime.date(2021, 1, 15),
       datetime.date(2021, 1, 14), datetime.date(2021, 1, 31),
       datetime.date(2021, 1, 27), datetime.date(2021, 1, 19),
       datetime.date(2021, 1, 7), datetime.date(2021, 1, 6),
       datetime.date(2021, 1, 17), datetime.date(2021, 1, 21),
       datetime.date(2021, 1, 10), datetime.date(2021, 1, 16),
       datetime.date(2021, 1, 9), datetime.date(2021, 1, 2),
       datetime.date(2021, 1, 29), datetime.date(2021, 1, 18),
       datetime.date(2021, 1, 26), datetime.date(2021, 1, 4),
       datetime.date(2021, 1, 23), datetime.date(2021, 1, 24),
       datetime.date(2021, 1, 13), datetime.date(2021, 1, 22),
       datetime.date(2021, 1, 8), datetime.date(2021, 1, 20),
       datetime.date(2021, 1, 5), datetime.date(2021, 1, 30),
       datetime.date(2021, 1, 28), datetime.date(2021, 1, 1),
       datetime.date(2021, 1, 3), datetime.date(2020, 12, 31)],
      dtype=object)


data_2021[(data_2021['pickup_date'] <= "2020-12-31") 
     | (data_2021['pickup_date'] > "2021-01-31")].head()


dropIndex = list(data_2021[(data_2021['pickup_date'] <= "2020-12-31") 
     | (data_2021['pickup_date'] > "2021-01-31")].index)


dropIndex[:5], len(dropIndex)

([3783], 1)


data_2021.drop([i for i in dropIndex], inplace=True)


pd.to_datetime(data_2021['tpep_pickup_datetime']).dt.date.unique()

array([datetime.date(2021, 1, 12), datetime.date(2021, 1, 11),
       datetime.date(2021, 1, 25), datetime.date(2021, 1, 15),
       datetime.date(2021, 1, 14), datetime.date(2021, 1, 31),
       datetime.date(2021, 1, 27), datetime.date(2021, 1, 19),
       datetime.date(2021, 1, 7), datetime.date(2021, 1, 6),
       datetime.date(2021, 1, 17), datetime.date(2021, 1, 21),
       datetime.date(2021, 1, 10), datetime.date(2021, 1, 16),
       datetime.date(2021, 1, 9), datetime.date(2021, 1, 2),
       datetime.date(2021, 1, 29), datetime.date(2021, 1, 18),
       datetime.date(2021, 1, 26), datetime.date(2021, 1, 4),
       datetime.date(2021, 1, 23), datetime.date(2021, 1, 24),
       datetime.date(2021, 1, 13), datetime.date(2021, 1, 22),
       datetime.date(2021, 1, 8), datetime.date(2021, 1, 20),
       datetime.date(2021, 1, 5), datetime.date(2021, 1, 30),
       datetime.date(2021, 1, 28), datetime.date(2021, 1, 1),
       datetime.date(2021, 1, 3)], dtype=object)


fig = plt.figure(figsize=(15,10))

record_2021_df = data_2021.value_counts('pickup_date').sort_index()
bar_chart = plt.bar(x =record_2021_df.index, 
                    height = record_2021_df.values)
bar_chart.colors = ["blue"]

plt.xlabel("Date")
plt.ylabel("Trip Count")
plt.title("Trip Count in Jan. 2021")

plt.show()


record_2021_df.min(), record_2021_df.max()

(252, 621)


record_2021_df = np.log10(record_2021_df)
record_2021_df.head()

pickup_date
2021-01-01    2.413300
2021-01-02    2.513218
2021-01-03    2.401401
2021-01-04    2.630428
2021-01-05    2.700704
dtype: float64


data_grouped = pd.DataFrame()

data_grouped['date'] = record_2021_df.index.day
# data_grouped['record_2019'] = record_2019_df.values
# data_grouped['record_2020'] = record_2020_df.values
data_grouped['record_2021'] = record_2021_df.values

data_grouped.head()


avg_record_2021_df = data_2021.value_counts('pickup_date').sort_index() / 13587
avg_record_2021_df.head()

pickup_date
2021-01-01    0.019062
2021-01-02    0.023994
2021-01-03    0.018547
2021-01-04    0.031427
2021-01-05    0.036947
dtype: float64


avg_record_2021 = avg_record_2021_df.sum() / len(avg_record_2021_df)
avg_record_2021

0.03251685078478717


total_amount_2021 = data_2021.groupby('pickup_date')['total_amount'].sum()
total_amount_2021 = np.log10(total_amount_2021)
total_amount_2021.head()

pickup_date
2021-01-01    3.685942
2021-01-02    3.791477
2021-01-03    3.692340
2021-01-04    3.899154
2021-01-05    3.968957
Name: total_amount, dtype: float64


data_2021['passenger_count'].unique()

array([ 1., nan,  2.,  5.,  0.,  3.,  6.,  4.])


data_2021[data_2021['passenger_count'] == 0].head()


dropIndex = list(data_2021[data_2021['passenger_count'] == 0].index)
dropIndex[:5], len(dropIndex)

([931595, 1001934, 19503, 361608, 635146], 285)


data_2021.drop([i for i in dropIndex], inplace=True)


data_2021 = data_2021[~pd.isnull(data_2021['passenger_count'])]


data_2021['passenger_count'].unique()

array([1., 2., 5., 3., 6., 4.])


pass_count_2021 = data_2021.groupby('pickup_date')['passenger_count'].sum()
pass_count_2021.head()

pickup_date
2021-01-01    379.0
2021-01-02    414.0
2021-01-03    361.0
2021-01-04    507.0
2021-01-05    680.0
Name: passenger_count, dtype: float64


myIndividuaLSelectedLabel = ipywidgets.Label()


ntrip_distance = 20
ntotal_payment = 20


Itrip_bins = np.linspace(1.26125, 48.80875, ntrip_distance+1)
Ipay_bins = np.linspace(6.9375, 259.1625, ntotal_payment+1)


Ihist2d, Itrip_edges, Ipay_edges = np.histogram2d(data_2021['trip_distance'],
                                                 data_2021['total_amount'],
                                                 weights=data_2021['passenger_count'],
                                                 bins = [Itrip_bins, Ipay_bins])


Itrip_centers = (Itrip_edges[:-1] + Itrip_edges[1:]) / 2
Ipay_centers = (Ipay_edges[:-1] + Ipay_edges[1:]) / 2


Itripmin = Itrip_centers.min()
Itripmax = Itrip_centers.max()
Ipaymin = Ipay_centers.min()
Ipaymax = Ipay_centers.max()


def individual_generate_histogram_from_trip_pay(data, ntrip=20, npay=20,
                                               tripmin=Itripmin, tripmax=Itripmax,
                                               paymin=Ipaymin,paymax=Ipaymax,
                                                takeLog=True):
    trip_bins = np.linspace(tripmin, tripmax, ntrip+1)
    pay_bins = np.linspace(paymin, paymax, npay+1)
    hist2d, trip_edges, pay_edges = np.histogram2d(data['trip_distance'],
                                                  data['total_amount'],
                                                  weights=data['passenger_count'],
                                                  bins = [trip_bins, pay_bins])
    hist2d = hist2d.T
    if takeLog:
        hist2d[hist2d <= 0] = np.nan # set zeros to NaNs
        # then take log
        hist2d = np.log10(hist2d)
        trip_centers = (trip_edges[:-1] + trip_edges[1:]) / 2
        pay_centers = (pay_edges[:-1] + pay_edges[1:]) / 2
    return hist2d, trip_centers, pay_centers, trip_edges, pay_edges


Ihist2d, Itrip_centers, Ipay_centers, Itrip_edges, Ipay_edges = individual_generate_histogram_from_trip_pay(data_2021)


# Scale
col_sc = bqplot.ColorScale(scheme="RdPu",
                          min=np.nanmin(Ihist2d),
                          max=np.nanmax(Ihist2d))
x_sc = bqplot.LinearScale()
y_sc = bqplot.LinearScale()

# Axis
c_ax = bqplot.ColorAxis(scale = col_sc,
                       orientation='vertical',
                       side='right')
x_ax = bqplot.Axis(scale = x_sc, label='Trip Distance')
y_ax = bqplot.Axis(scale = y_sc, label='Total Payment',
                  orientation='vertical', label_offset="45px")


# Marks
Iheat_map = bqplot.GridHeatMap(color = Ihist2d,
                              row = Ipay_centers,
                              column = Itrip_centers,
                              scales = {'color':col_sc,
                                       'row': y_sc, 
                                       'column':x_sc},
                              interactions = {'click':'select'},
                              anchor_style = {'fill':'blue'},
                              selected_style = {'opacity':1.00},
                              unselected_style = {'opacity':1.00})


# Scale
x_scl = bqplot.DateScale()
y_scl = bqplot.LogScale()

# Axis
ax_xcl = bqplot.Axis(label='Date', scale = x_scl)
ax_ycl = bqplot.Axis(label = 'Passenger Count', scale = y_scl, 
                    orientation = 'vertical', side = 'left')

# Marks
i,j = 19, 0
Itrips = [Itrip_edges[j], Itrip_edges[j+1]]
Ipays = [Ipay_edges[i], Ipay_edges[i+1]]

# region mask
region_mask = ((data_2021['total_amount'] >= Ipays[0]) & (data_2021['total_amount']<=Ipays[1]) &\
                    (data_2021['trip_distance'] >= Itrips[0]) & (data_2021['trip_distance']<=Itrips[1]))

# Fig
Ipass_scatt = bqplot.Scatter(x=data_2021['pickup_date'][region_mask],
                            y=data_2021['passenger_count'][region_mask],
                            scales = {'x':x_scl, 'y':y_scl})
# data_2021['pickup_date'][region_mask]
# data_2021['passenger_count'][region_mask]


def get_individual_data_value(change):
    if len(change['owner'].selected) == 1:
        i, j = change['owner'].selected[0]
        v = Ihist2d[i, j]
        myIndividuaLSelectedLabel.value = "Passenger Count in Log = " + str(v)
        Itrips = [Itrip_edges[j], Itrip_edges[j+1]]
        Ipays = [Ipay_edges[i], Ipay_edges[i+1]]
        region_mask = ((data_2021['total_amount'] >= Ipays[0]) & (data_2021['total_amount']<=Ipays[1]) &\
                    (data_2021['trip_distance'] >= Itrips[0]) & (data_2021['trip_distance']<=Itrips[1]))
        Ipass_scatt.x = data_2021['pickup_date'][region_mask]
        Ipass_scatt.y = data_2021['passenger_count'][region_mask]
        
Iheat_map.observe(get_individual_data_value, 'selected')


def get_test(change):
    if len(change['owner'].selected) == 1:
        i, j = change['owner'].selected[0]
        print(i, j)
        v = Ihist2d[i, j]
        print(v, v.dtype)
# Iheat_map.observe(get_test, 'selected')


fig_Iheatmap = bqplot.Figure(marks = [Iheat_map], axes = [c_ax, y_ax, x_ax])
fig_Ipass = bqplot.Figure(marks = [Ipass_scatt], axes = [ax_xcl, ax_ycl])


fig_Iheatmap.layout.min_width='500px'
fig_Ipass.layout.min_width='500px'

myDashboard = ipywidgets.VBox([myIndividuaLSelectedLabel, ipywidgets.HBox([fig_Iheatmap,fig_Ipass])])
# myDashboard


x_column = ['payment_type']
y_column = ['total_amount', 'tolls_amount']


# I looked at the tutorial for reference and hints of this graph
# https://www.tutorialspoint.com/python-matplotlib-multiple-bars
plt.rcParams["figure.figsize"] = [15, 10]
plt.rcParams["figure.autolayout"] = True
plt.rc('axes', titlesize=30)
plt.rc('axes', labelsize=25)
plt.rc('xtick', labelsize=18)
plt.rc('ytick', labelsize=18)

labels = data_grouped['date']
# record_2019 = record_2019_df.values
# record_2020 = record_2020_df.values
record_2021 = record_2021_df.values

x = np.arange(len(labels))
width = 0.7

fig, ax = plt.subplots()
# rects_2019 = ax.bar(x  - width, record_2019, width, label = "2019")
# rects_2020 = ax.bar(x, record_2020, width, label = "2020")
rects_2021 = ax.bar(x, record_2021, width, label = "2021")

ax.set_xlabel("Date in January")
ax.set_ylabel('Taxi Drives in Log')
ax.set_title('Taxi Drives in January, 2021')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

def autolabel(rects):
    for rect in rects:
        height = round(rect.get_height(), 2)
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3), # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
    
autolabel(rects_2021)

# trip_plt = plt

plt.show()


avg_record_2021

0.03251685078478717


plt.rcParams["figure.figsize"] = [15, 10]
plt.rcParams["figure.autolayout"] = True
plt.rc('axes', titlesize=30)
plt.rc('axes', labelsize=25)
plt.rc('xtick', labelsize=18)
plt.rc('ytick', labelsize=18)

labels = data_grouped['date']
# record_2019 = total_amount_2019.values
# record_2020 = total_amount_2020.values
record_2021 = total_amount_2021.values

x = np.arange(len(labels))
width = 0.7

fig, ax = plt.subplots()
# rects_2019 = ax.bar(x  - width, record_2019, width, label = "2019")
# rects_2020 = ax.bar(x, record_2020, width, label = "2020")
rects_2021 = ax.bar(x, record_2021, width, label = "2021")

ax.set_xlabel("Date in January")
ax.set_ylabel('Total Payment')
ax.set_title('Total Payment in January, 2019 - 2021')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

def autolabel(rects):
    for rect in rects:
        height = round(rect.get_height(), 2)
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3), # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects_2021)

# pay_plt = plt

plt.show()


# I looked at the tutorial for reference and hints of this graph
# https://www.tutorialspoint.com/python-matplotlib-multiple-bars
plt.rcParams["figure.figsize"] = [15, 10]
plt.rcParams["figure.autolayout"] = True
plt.rc('axes', titlesize=30)
plt.rc('axes', labelsize=25)
plt.rc('xtick', labelsize=18)
plt.rc('ytick', labelsize=18)

labels = data_grouped['date']
# record_2019 = record_2019_df.values
# record_2020 = record_2020_df.values
record_2021 = np.log10(pass_count_2021)

x = np.arange(len(labels))
width = 0.7

fig, ax = plt.subplots()
# rects_2019 = ax.bar(x  - width, record_2019, width, label = "2019")
# rects_2020 = ax.bar(x, record_2020, width, label = "2020")
rects_2021 = ax.bar(x, record_2021, width, label = "2021")

ax.set_xlabel("Date in January")
ax.set_ylabel('Passenger Count in Log')
ax.set_title('Passenger Count in January, 2021')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

def autolabel(rects):
    for rect in rects:
        height = round(rect.get_height(), 2)
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3), # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
    
autolabel(rects_2021)

# pass_plt = plt

plt.show()


myDashboard

VBox(children=(Label(value=''), HBox(children=(Figure(axes=[ColorAxis(orientation='vertical', scale=ColorScale…


@ipywidgets.interact(style=plt.style.available, colormap_name=plt.colormaps(), x=x_column, y=y_column)
def payment_scatter(style, colormap_name, x, y):
    with plt.style.context(style):
        colorScale = data_2021['tip_amount']
        plt.scatter(data_2021[x], data_2021[y], c=colorScale, cmap=colormap_name)
        plt.xlabel(x)
        plt.ylabel(y)
        plt.title(x + " vs " + y)
        plt.show()

interactive(children=(Dropdown(description='style', options=('Solarize_Light2', '_classic_test_patch', 'bmh', …

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	passenger_count	trip_distance	RatecodeID	store_and_fwd_flag	PULocationID	DOLocationID	payment_type	fare_amount	extra	mta_tax	tip_amount	improvement_surcharge	total_amount	congestion_surcharge
411971	1.0	2021-01-12 06:23:14	2021-01-12 06:34:16	1.0	2.30	1.0	N	226	145	2.0	10.0	0.0	0.5	0.00	0.3	10.80	0.0
406278	1.0	2021-01-11 19:42:05	2021-01-11 19:48:14	1.0	1.90	1.0	N	237	137	1.0	7.5	3.5	0.5	2.35	0.3	14.15	2.5
989763	2.0	2021-01-25 14:59:49	2021-01-25 15:05:53	1.0	1.20	1.0	N	79	137	1.0	6.5	0.0	0.5	1.96	0.3	11.76	2.5
580385	2.0	2021-01-15 15:41:45	2021-01-15 15:44:18	1.0	0.43	1.0	N	237	236	2.0	4.0	0.0	0.5	0.00	0.3	7.30	2.5
542365	2.0	2021-01-14 18:47:27	2021-01-14 18:54:50	1.0	1.27	1.0	N	234	211	1.0	7.0	1.0	0.5	1.00	0.3	12.30	2.5

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	trip_distance	RatecodeID	store_and_fwd_flag	PULocationID	DOLocationID	payment_type	fare_amount	extra	mta_tax	tip_amount	improvement_surcharge	total_amount	congestion_surcharge	pickup_date
931595	1.0	2021-01-23 20:01:34	2021-01-23 20:16:35	4.1	1.0	N	263	48	2.0	14.50	2.5	0.5	0.00	0.3	17.80	2.5	2021-01-23
1001934	1.0	2021-01-25 18:15:04	2021-01-25 18:15:46	0.2	1.0	N	264	264	1.0	2.50	1.0	0.5	5.00	0.3	9.30	0.0	2021-01-25
19503	2.0	2021-01-01 19:23:41	2021-01-01 19:23:48	0.0	5.0	N	116	116	1.0	31.62	0.0	0.5	0.01	0.3	32.43	0.0	2021-01-01
361608	1.0	2021-01-10 18:13:12	2021-01-10 18:21:14	1.2	1.0	N	151	41	1.0	7.50	0.0	0.5	1.00	0.3	9.30	0.0	2021-01-10
635146	1.0	2021-01-16 21:39:30	2021-01-16 22:04:07	5.1	1.0	N	230	42	1.0	20.00	3.0	0.5	4.75	0.3	28.55	2.5	2021-01-16

Rocky Wu - jywu3¶

Final Project - Code Section¶

Explore the Dataset:¶

Columns Explained:¶

Take a look of the dataset¶

Explore the values inside each column¶

Data Cleaning¶

Create helper function for the dashboard¶

Heat map¶

Scatter plot¶

Create x, y column for another interactive plt¶

Final Project Part 3 - Write Up¶

Data Visualization - TLC Yellow Taxi Simple Investigation¶

TLC DATA AT JANUARY 2021 DEEP DIVE¶

So, what is Yellow Taxi in NYC?¶

Digging a little deeper¶

Total amount¶

Passenger Count¶

Another interactive plot for Part 3¶

End of Part 3¶