While this project emphasizes communicating data findings, it's essential to first perform data wrangling to ensure the dataset is tidy and the findings are accurate.
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
We'll first load and take a look at the dataset.
df = pd.read_csv('201902-fordgobike-tripdata.csv')
df.head()
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
1 | 42521 | 2019-02-28 18:53:21.7890 | 2019-03-01 06:42:03.0560 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
We'll look at the structure and contents of the DataFrame, to help identify issues like missing values or data types that may need to be adjusted.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183412 entries, 0 to 183411
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   duration_sec             183412 non-null  int64  
 1   start_time               183412 non-null  object 
 2   end_time                 183412 non-null  object 
 3   start_station_id         183215 non-null  float64
 4   start_station_name       183215 non-null  object 
 5   start_station_latitude   183412 non-null  float64
 6   start_station_longitude  183412 non-null  float64
 7   end_station_id           183215 non-null  float64
 8   end_station_name         183215 non-null  object 
 9   end_station_latitude     183412 non-null  float64
 10  end_station_longitude    183412 non-null  float64
 11  bike_id                  183412 non-null  int64  
 12  user_type                183412 non-null  object 
 13  member_birth_year        175147 non-null  float64
 14  member_gender            175147 non-null  object 
 15  bike_share_for_all_trip  183412 non-null  object 
dtypes: float64(7), int64(2), object(7)
memory usage: 22.4+ MB
- The 'start_time' and 'end_time' fields should be changed to the datetime data type.
- The 'member_birth_year' field should be converted to an integer data type.
- The 'user_type' and 'member_gender' fields could be changed to a category data type.
- The 'bike_share_for_all_trip' field could be changed to a boolean data type.
- The 'bike_id', 'start_station_id', and 'end_station_id' fields should be changed to strings.
We'll check for duplicate rows.
sum(df.duplicated())
0
Next, we'll count the missing (null) values in each column of the DataFrame.
df.isnull().sum()
duration_sec                  0
start_time                    0
end_time                      0
start_station_id            197
start_station_name          197
start_station_latitude        0
start_station_longitude       0
end_station_id              197
end_station_name            197
end_station_latitude          0
end_station_longitude         0
bike_id                       0
user_type                     0
member_birth_year          8265
member_gender              8265
bike_share_for_all_trip       0
dtype: int64
Several columns contain missing (null) values.
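To see how much of the dataset these nulls actually affect, the counts can be expressed as percentages of the total rows. A minimal sketch on a toy frame (the real call would be `df.isnull().mean() * 100` on the trip data):

```python
import numpy as np
import pandas as pd

# toy stand-in for the trip DataFrame (not the real data)
df = pd.DataFrame({
    'start_station_id': [1.0, np.nan, 3.0, 4.0],
    'member_birth_year': [1984.0, np.nan, np.nan, 1990.0],
})

# isnull() gives a boolean frame; the column-wise mean is the null fraction
missing_pct = df.isnull().mean() * 100
print(missing_pct)
```

For the real dataset, the 8,265 missing `member_birth_year` values work out to roughly 4.5% of the 183,412 rows.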
Let's summarize the DataFrame's key statistics, such as count, mean, standard deviation, and quartiles.
df.describe()
duration_sec | start_station_id | start_station_latitude | start_station_longitude | end_station_id | end_station_latitude | end_station_longitude | bike_id | member_birth_year | |
---|---|---|---|---|---|---|---|---|---|
count | 183412.000000 | 183215.000000 | 183412.000000 | 183412.000000 | 183215.000000 | 183412.000000 | 183412.000000 | 183412.000000 | 175147.000000 |
mean | 726.078435 | 138.590427 | 37.771223 | -122.352664 | 136.249123 | 37.771427 | -122.352250 | 4472.906375 | 1984.806437 |
std | 1794.389780 | 111.778864 | 0.099581 | 0.117097 | 111.515131 | 0.099490 | 0.116673 | 1664.383394 | 10.116689 |
min | 61.000000 | 3.000000 | 37.317298 | -122.453704 | 3.000000 | 37.317298 | -122.453704 | 11.000000 | 1878.000000 |
25% | 325.000000 | 47.000000 | 37.770083 | -122.412408 | 44.000000 | 37.770407 | -122.411726 | 3777.000000 | 1980.000000 |
50% | 514.000000 | 104.000000 | 37.780760 | -122.398285 | 100.000000 | 37.781010 | -122.398279 | 4958.000000 | 1987.000000 |
75% | 796.000000 | 239.000000 | 37.797280 | -122.286533 | 235.000000 | 37.797320 | -122.288045 | 5502.000000 | 1992.000000 |
max | 85444.000000 | 398.000000 | 37.880222 | -121.874119 | 398.000000 | 37.880222 | -121.874119 | 6645.000000 | 2001.000000 |
As we can see, the earliest recorded birth year for members is 1878, which appears to be an error. We will investigate these inaccurate birth years more thoroughly in the cleaning section of this report.
Let's check how many riders would be older than 85 (birth year before 1934, given that these trips took place in early 2019).
df.query("member_birth_year < 1934")
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1285 | 148 | 2019-02-28 19:29:17.6270 | 2019-02-28 19:31:45.9670 | 158.0 | Shattuck Ave at Telegraph Ave | 37.833279 | -122.263490 | 173.0 | Shattuck Ave at 55th St | 37.840364 | -122.264488 | 5391 | Subscriber | 1900.0 | Male | Yes |
5197 | 217 | 2019-02-28 13:51:46.2380 | 2019-02-28 13:55:24.1270 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 71.0 | Broderick St at Oak St | 37.773063 | -122.439078 | 5801 | Subscriber | 1931.0 | Male | No |
5266 | 384 | 2019-02-28 13:35:05.4280 | 2019-02-28 13:41:30.2230 | 84.0 | Duboce Park | 37.769200 | -122.433812 | 71.0 | Broderick St at Oak St | 37.773063 | -122.439078 | 6608 | Subscriber | 1931.0 | Male | No |
5447 | 147 | 2019-02-28 13:08:56.9350 | 2019-02-28 13:11:24.0620 | 84.0 | Duboce Park | 37.769200 | -122.433812 | 72.0 | Page St at Scott St | 37.772406 | -122.435650 | 5018 | Subscriber | 1931.0 | Male | No |
10827 | 1315 | 2019-02-27 19:21:34.4360 | 2019-02-27 19:43:30.0080 | 343.0 | Bryant St at 2nd St | 37.783172 | -122.393572 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 6249 | Subscriber | 1900.0 | Male | No |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
177708 | 1527 | 2019-02-01 19:09:28.3870 | 2019-02-01 19:34:55.9630 | 343.0 | Bryant St at 2nd St | 37.783172 | -122.393572 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 5286 | Subscriber | 1900.0 | Male | No |
177885 | 517 | 2019-02-01 18:38:40.4710 | 2019-02-01 18:47:18.3920 | 25.0 | Howard St at 2nd St | 37.787522 | -122.397405 | 30.0 | San Francisco Caltrain (Townsend St at 4th St) | 37.776598 | -122.395282 | 2175 | Subscriber | 1902.0 | Female | No |
177955 | 377 | 2019-02-01 18:23:33.4110 | 2019-02-01 18:29:50.7950 | 26.0 | 1st St at Folsom St | 37.787290 | -122.394380 | 321.0 | 5th St at Folsom | 37.780146 | -122.403071 | 5444 | Subscriber | 1933.0 | Female | Yes |
182830 | 428 | 2019-02-01 07:45:05.9340 | 2019-02-01 07:52:14.9220 | 284.0 | Yerba Buena Center for the Arts (Howard St at ... | 37.784872 | -122.400876 | 67.0 | San Francisco Caltrain Station 2 (Townsend St... | 37.776639 | -122.395526 | 5031 | Subscriber | 1901.0 | Male | No |
183388 | 490 | 2019-02-01 00:39:53.1120 | 2019-02-01 00:48:03.3380 | 61.0 | Howard St at 8th St | 37.776513 | -122.411306 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 5411 | Subscriber | 1927.0 | Male | No |
187 rows × 16 columns
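The ages implied by this query can be made explicit. A small sketch, assuming all trips occurred in 2019, with toy values standing in for `df['member_birth_year']`:

```python
import pandas as pd

# toy birth years; the real series is df['member_birth_year']
birth_year = pd.Series([1900.0, 1931.0, 1984.0, 1990.0])

# implied rider age, assuming the trips are from 2019
age = 2019 - birth_year
over_85 = int((age > 85).sum())
print(over_85)  # number of riders implied to be older than 85
```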
Data Cleaning
The initial step in the cleaning process involved making a copy of the original dataset.
# make a copy
df_clean = df.copy()
We'll remove the rows with null values and reset the index.
# dropping rows with null values
df_clean.dropna(inplace=True)
df_clean.reset_index(drop=True, inplace = True)
Let's review the dataset overview to see the changes
# Display DataFrame info
df_clean.info()
# Check for null counts
null_counts = df_clean.isnull().sum()
print(null_counts)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174952 entries, 0 to 174951
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   duration_sec             174952 non-null  int64  
 1   start_time               174952 non-null  object 
 2   end_time                 174952 non-null  object 
 3   start_station_id         174952 non-null  float64
 4   start_station_name       174952 non-null  object 
 5   start_station_latitude   174952 non-null  float64
 6   start_station_longitude  174952 non-null  float64
 7   end_station_id           174952 non-null  float64
 8   end_station_name         174952 non-null  object 
 9   end_station_latitude     174952 non-null  float64
 10  end_station_longitude    174952 non-null  float64
 11  bike_id                  174952 non-null  int64  
 12  user_type                174952 non-null  object 
 13  member_birth_year        174952 non-null  float64
 14  member_gender            174952 non-null  object 
 15  bike_share_for_all_trip  174952 non-null  object 
dtypes: float64(7), int64(2), object(7)
memory usage: 21.4+ MB
duration_sec               0
start_time                 0
end_time                   0
start_station_id           0
start_station_name         0
start_station_latitude     0
start_station_longitude    0
end_station_id             0
end_station_name           0
end_station_latitude       0
end_station_longitude      0
bike_id                    0
user_type                  0
member_birth_year          0
member_gender              0
bike_share_for_all_trip    0
dtype: int64
Next, we'll apply the data type changes noted earlier.
# Define a dictionary of columns and their desired data types
dtype_mapping = {
'start_time': 'datetime64[ns]',
'end_time': 'datetime64[ns]',
'member_birth_year': 'int',
'bike_id': 'object',
'start_station_id': 'object',
'end_station_id': 'object',
'user_type': 'category',
'member_gender': 'category',
'bike_share_for_all_trip': 'category'
}
# Apply the conversions
for column, dtype in dtype_mapping.items():
df_clean[column] = df_clean[column].astype(dtype)
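One caveat: converting object columns to `datetime64[ns]` via `astype` has become stricter in recent pandas releases, so `pd.to_datetime` is the more robust route for the two timestamp columns. A hedged alternative sketch, with toy data standing in for the real columns:

```python
import pandas as pd

# toy stand-in with the same timestamp format as the dataset
df_clean = pd.DataFrame({
    'start_time': ['2019-02-28 17:32:10.1450', '2019-02-28 18:53:21.7890'],
    'end_time': ['2019-03-01 08:01:55.9750', '2019-03-01 06:42:03.0560'],
})

# pd.to_datetime parses the strings and raises on malformed rows
for col in ['start_time', 'end_time']:
    df_clean[col] = pd.to_datetime(df_clean[col])

print(df_clean.dtypes)
```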
Let's check the overview of the dataframe
df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174952 entries, 0 to 174951
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   duration_sec             174952 non-null  int64         
 1   start_time               174952 non-null  datetime64[ns]
 2   end_time                 174952 non-null  datetime64[ns]
 3   start_station_id         174952 non-null  object        
 4   start_station_name       174952 non-null  object        
 5   start_station_latitude   174952 non-null  float64       
 6   start_station_longitude  174952 non-null  float64       
 7   end_station_id           174952 non-null  object        
 8   end_station_name         174952 non-null  object        
 9   end_station_latitude     174952 non-null  float64       
 10  end_station_longitude    174952 non-null  float64       
 11  bike_id                  174952 non-null  object        
 12  user_type                174952 non-null  category      
 13  member_birth_year        174952 non-null  int64         
 14  member_gender            174952 non-null  category      
 15  bike_share_for_all_trip  174952 non-null  category      
dtypes: category(3), datetime64[ns](2), float64(4), int64(2), object(5)
memory usage: 17.9+ MB
The earliest birth year listed is 1878, and several birth years imply riders well over 90 years old, which is highly improbable. We will investigate and remove records with implausibly early birth years, as these appear to be errors.
Let's take a look at the statistics for the member birth years in the dataset.
df_clean.member_birth_year.describe()
count    174952.000000
mean       1984.803135
std          10.118731
min        1878.000000
25%        1980.000000
50%        1987.000000
75%        1992.000000
max        2001.000000
Name: member_birth_year, dtype: float64
Let's look at a histogram of member birth years, using 5-year bins from 1880 to 2000 to visualize the frequency distribution of the data.
# 5-year bins spanning the observed birth years (starting the range at 0 only adds empty bins)
bin_edges = np.arange(1875, df_clean['member_birth_year'].max() + 5, 5)
plt.hist(data = df_clean, x = 'member_birth_year', bins = bin_edges)
plt.xlim(1880, 2000)
plt.xlabel('Member Birth Year')
plt.ylabel('Frequency');
Next, we'll look for outliers in member birth years by visualizing the data with a box plot.
# look for outliers
sb.boxplot(x=df_clean['member_birth_year'])
<Axes: xlabel='member_birth_year'>
Let's calculate the first and third quartiles (Q1 and Q3) and the interquartile range (IQR) for member birth years to identify potential outliers.
# Using the Interquartile Range to understand outliers
q1 = df_clean.member_birth_year.quantile(0.25)
q3 = df_clean.member_birth_year.quantile(0.75)
iqr = q3 - q1
print(q1)
print(q3)
print(iqr)
1980.0
1992.0
12.0
We'll calculate the lower whisker for identifying outliers in member birth years, which is determined by subtracting 1.5 times the interquartile range (IQR) from Q1.
lower_whisker = q1 - 1.5 * iqr
print(lower_whisker)
1962.0
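For completeness, the upper whisker is the mirror image, `q3 + 1.5 * iqr`; values outside either whisker are the candidate outliers. A self-contained sketch on toy birth years (the real series is `df_clean['member_birth_year']`):

```python
import pandas as pd

# toy birth years standing in for df_clean['member_birth_year']
years = pd.Series([1900, 1955, 1980, 1987, 1992, 2001])

q1, q3 = years.quantile(0.25), years.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# flag values falling outside either whisker
outliers = years[(years < lower) | (years > upper)]
print(lower, upper, outliers.tolist())
```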
Let's filter the dataset to examine members born before 1962.
df_clean.query("member_birth_year < 1962")
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | -122.405950 | 5200 | Subscriber | 1959 | Male | No |
40 | 116 | 2019-02-28 23:44:00.988 | 2019-02-28 23:45:57.482 | 104.0 | 4th St at 16th St | 37.767045 | -122.390833 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 823 | Subscriber | 1959 | Male | No |
62 | 681 | 2019-02-28 23:19:37.366 | 2019-02-28 23:30:58.862 | 43.0 | San Francisco Public Library (Grove St at Hyde... | 37.778768 | -122.415929 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6333 | Subscriber | 1959 | Male | No |
196 | 547 | 2019-02-28 22:25:51.137 | 2019-02-28 22:34:58.970 | 76.0 | McCoppin St at Valencia St | 37.771662 | -122.422423 | 43.0 | San Francisco Public Library (Grove St at Hyde... | 37.778768 | -122.415929 | 6333 | Subscriber | 1961 | Female | No |
297 | 217 | 2019-02-28 21:58:47.639 | 2019-02-28 22:02:24.693 | 149.0 | Emeryville Town Hall | 37.831275 | -122.285633 | 153.0 | 59th St at Horton St | 37.840945 | -122.291360 | 5210 | Subscriber | 1961 | Male | No |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
174844 | 459 | 2019-02-01 05:15:05.178 | 2019-02-01 05:22:44.272 | 104.0 | 4th St at 16th St | 37.767045 | -122.390833 | 30.0 | San Francisco Caltrain (Townsend St at 4th St) | 37.776598 | -122.395282 | 3446 | Subscriber | 1959 | Male | No |
174852 | 373 | 2019-02-01 04:42:44.709 | 2019-02-01 04:48:58.076 | 131.0 | 22nd St at Dolores St | 37.755000 | -122.425728 | 129.0 | Harrison St at 20th St | 37.758862 | -122.412544 | 5427 | Subscriber | 1958 | Male | No |
174853 | 100 | 2019-02-01 04:46:54.805 | 2019-02-01 04:48:34.843 | 80.0 | Townsend St at 5th St | 37.775235 | -122.397437 | 67.0 | San Francisco Caltrain Station 2 (Townsend St... | 37.776639 | -122.395526 | 3138 | Subscriber | 1950 | Male | No |
174926 | 400 | 2019-02-01 00:46:47.276 | 2019-02-01 00:53:27.596 | 220.0 | San Pablo Ave at MLK Jr Way | 37.811351 | -122.273422 | 337.0 | Webster St at 19th St | 37.806970 | -122.266588 | 3487 | Subscriber | 1945 | Male | Yes |
174929 | 490 | 2019-02-01 00:39:53.112 | 2019-02-01 00:48:03.338 | 61.0 | Howard St at 8th St | 37.776513 | -122.411306 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 5411 | Subscriber | 1927 | Male | No |
5781 rows × 16 columns
Let's look at the bottom 1% of birth years.
df.member_birth_year.describe(percentiles = [.01])
count    175147.000000
mean       1984.806437
std          10.116689
min        1878.000000
1%         1955.000000
50%        1987.000000
max        2001.000000
Name: member_birth_year, dtype: float64
We'll look for birth years prior to 1955.
df_clean.query("member_birth_year < 1955")
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
476 | 235 | 2019-02-28 21:17:57.047 | 2019-02-28 21:21:52.631 | 34.0 | Father Alfred E Boeddeker Park | 37.783988 | -122.412408 | 58.0 | Market St at 10th St | 37.776619 | -122.417385 | 5202 | Subscriber | 1954 | Male | No |
956 | 384 | 2019-02-28 19:56:45.837 | 2019-02-28 20:03:10.473 | 250.0 | North Berkeley BART Station | 37.873558 | -122.283093 | 257.0 | Fifth St at Delaware St | 37.870407 | -122.299676 | 1671 | Subscriber | 1954 | Male | No |
1033 | 303 | 2019-02-28 19:49:38.120 | 2019-02-28 19:54:42.044 | 43.0 | San Francisco Public Library (Grove St at Hyde... | 37.778768 | -122.415929 | 76.0 | McCoppin St at Valencia St | 37.771662 | -122.422423 | 6333 | Subscriber | 1945 | Male | Yes |
1238 | 148 | 2019-02-28 19:29:17.627 | 2019-02-28 19:31:45.967 | 158.0 | Shattuck Ave at Telegraph Ave | 37.833279 | -122.263490 | 173.0 | Shattuck Ave at 55th St | 37.840364 | -122.264488 | 5391 | Subscriber | 1900 | Male | Yes |
1295 | 1362 | 2019-02-28 19:02:33.643 | 2019-02-28 19:25:16.561 | 15.0 | San Francisco Ferry Building (Harry Bridges Pl... | 37.795392 | -122.394203 | 97.0 | 14th St at Mission St | 37.768265 | -122.420110 | 48 | Subscriber | 1954 | Male | No |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
174773 | 191 | 2019-02-01 06:32:38.467 | 2019-02-01 06:35:50.222 | 15.0 | San Francisco Ferry Building (Harry Bridges Pl... | 37.795392 | -122.394203 | 20.0 | Mechanics Monument Plaza (Market St at Bush St) | 37.791300 | -122.399051 | 3108 | Subscriber | 1947 | Male | No |
174808 | 966 | 2019-02-01 05:57:01.688 | 2019-02-01 06:13:08.313 | 126.0 | Esprit Park | 37.761634 | -122.390648 | 16.0 | Steuart St at Market St | 37.794130 | -122.394430 | 1338 | Subscriber | 1952 | Male | No |
174853 | 100 | 2019-02-01 04:46:54.805 | 2019-02-01 04:48:34.843 | 80.0 | Townsend St at 5th St | 37.775235 | -122.397437 | 67.0 | San Francisco Caltrain Station 2 (Townsend St... | 37.776639 | -122.395526 | 3138 | Subscriber | 1950 | Male | No |
174926 | 400 | 2019-02-01 00:46:47.276 | 2019-02-01 00:53:27.596 | 220.0 | San Pablo Ave at MLK Jr Way | 37.811351 | -122.273422 | 337.0 | Webster St at 19th St | 37.806970 | -122.266588 | 3487 | Subscriber | 1945 | Male | Yes |
174929 | 490 | 2019-02-01 00:39:53.112 | 2019-02-01 00:48:03.338 | 61.0 | Howard St at 8th St | 37.776513 | -122.411306 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 5411 | Subscriber | 1927 | Male | No |
1680 rows × 16 columns
Rather than using the IQR lower whisker (1962), we'll drop the rows with birth years before 1955 (the bottom 1%) and reset the index.
df_clean.drop(df_clean[df_clean.member_birth_year < 1955].index, inplace=True)
df_clean.reset_index(drop=True, inplace = True)
Let's confirm that no such records remain.
df_clean.query("member_birth_year < 1955")
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip |
---|
We'll look at an overview of the DataFrame again and check for nulls.
# Display DataFrame info
df_clean.info()
# Check for null counts
null_counts = df_clean.isnull().sum()
print(null_counts)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173272 entries, 0 to 173271
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   duration_sec             173272 non-null  int64         
 1   start_time               173272 non-null  datetime64[ns]
 2   end_time                 173272 non-null  datetime64[ns]
 3   start_station_id         173272 non-null  object        
 4   start_station_name       173272 non-null  object        
 5   start_station_latitude   173272 non-null  float64       
 6   start_station_longitude  173272 non-null  float64       
 7   end_station_id           173272 non-null  object        
 8   end_station_name         173272 non-null  object        
 9   end_station_latitude     173272 non-null  float64       
 10  end_station_longitude    173272 non-null  float64       
 11  bike_id                  173272 non-null  object        
 12  user_type                173272 non-null  category      
 13  member_birth_year        173272 non-null  int64         
 14  member_gender            173272 non-null  category      
 15  bike_share_for_all_trip  173272 non-null  category      
dtypes: category(3), datetime64[ns](2), float64(4), int64(2), object(5)
memory usage: 17.7+ MB
duration_sec               0
start_time                 0
end_time                   0
start_station_id           0
start_station_name         0
start_station_latitude     0
start_station_longitude    0
end_station_id             0
end_station_name           0
end_station_latitude       0
end_station_longitude      0
bike_id                    0
user_type                  0
member_birth_year          0
member_gender              0
bike_share_for_all_trip    0
dtype: int64
df_clean.describe()
duration_sec | start_time | end_time | start_station_latitude | start_station_longitude | end_station_latitude | end_station_longitude | member_birth_year | |
---|---|---|---|---|---|---|---|---|
count | 173272.000000 | 173272 | 173272 | 173272.000000 | 173272.000000 | 173272.000000 | 173272.000000 | 173272.000000 |
mean | 703.878549 | 2019-02-15 21:21:32.505712128 | 2019-02-15 21:33:16.883208960 | 37.771060 | -122.351657 | 37.771257 | -122.351228 | 1985.171747 |
min | 61.000000 | 2019-02-01 00:00:20.636000 | 2019-02-01 00:04:52.058000 | 37.317298 | -122.453704 | 37.317298 | -122.453704 | 1955.000000 |
25% | 323.000000 | 2019-02-08 08:30:43.535000064 | 2019-02-08 08:41:02.230500096 | 37.770407 | -122.411901 | 37.770407 | -122.411647 | 1980.000000 |
50% | 510.000000 | 2019-02-15 21:32:02.059000064 | 2019-02-15 21:45:20.187500032 | 37.780760 | -122.398279 | 37.781010 | -122.397437 | 1987.000000 |
75% | 788.000000 | 2019-02-22 11:19:06.316000 | 2019-02-22 11:33:13.886749952 | 37.797320 | -122.283093 | 37.797673 | -122.286533 | 1992.000000 |
max | 84548.000000 | 2019-02-28 23:59:18.548000 | 2019-03-01 08:01:55.975000 | 37.880222 | -121.874119 | 37.880222 | -121.874119 | 2001.000000 |
std | 1647.305625 | NaN | NaN | 0.100682 | 0.118001 | 0.100587 | 0.117560 | 9.378430 |
What is the structure of your dataset?
The dataset has 173,272 entries after cleaning the data. The dataset contains these features:
- duration_sec
- start_time
- end_time
- start_station_id
- start_station_name
- start_station_latitude
- start_station_longitude
- end_station_id
- end_station_name
- end_station_latitude
- end_station_longitude
- bike_id
- user_type
- member_birth_year
- member_gender
- bike_share_for_all_trip
What is/are the main feature(s) of interest in your dataset?
The main features of interest in the dataset are those that effectively predict bike trip duration and influence trip frequency. Understanding these key factors will help us identify the most significant predictors for both aspects of bike usage.
What features in the dataset do you think will help support your investigation into your feature(s) of interest?
In my investigation into the features of interest regarding bike rides, I anticipate that the following dataset features will provide valuable insights:
Time of Day: I expect this feature to significantly influence both the frequency and duration of bike rides, with peak usage occurring during commuting hours.
Day of the Week: Similar to time of day, this feature is likely to affect ride patterns, with certain days showing higher activity levels.
User Type: Differentiating between subscribers and customers will help in understanding how user engagement affects riding behavior.
Member Birth Year: Analyzing the birth year of riders may provide insights into generational differences in bike usage.
Gender: Exploring the relationship between gender and ride frequency/duration will help identify any disparities in bike usage patterns.
These features collectively will support a comprehensive analysis of the factors influencing bike ride behavior.
Univariate Exploration
In this section, we will analyze the distributions of individual variables. First, we’ll focus on the distribution of bike trip durations.
Here, we'll create a histogram of trip durations in seconds
bin_edges = np.arange(0, df_clean['duration_sec'].max() + 100, 100)
plt.hist(data = df_clean, x = 'duration_sec', bins = bin_edges)
plt.xlim(0, 3000)
plt.xlabel('Trip Duration (seconds)')
plt.ylabel('Frequency');
The histogram shows the frequency distribution of trip durations: most trips last between 0 and 1,000 seconds, and frequency drops sharply for longer durations.
# gauging bin limits are appropriate for the plot
df_clean['duration_sec'].describe()
count    173272.000000
mean        703.878549
std        1647.305625
min          61.000000
25%         323.000000
50%         510.000000
75%         788.000000
max       84548.000000
Name: duration_sec, dtype: float64
The data follows a highly skewed, long-tailed distribution, with most trip durations under 1,000 seconds. To gain clearer insights, a logarithmic transformation is appropriate.
bin_edges = 10 ** np.arange(1, 5 + 0.1, 0.1)
ticks = [10, 30, 100, 300, 1000, 3000, 10000, 30000, 100000]
plt.hist(data = df_clean, x = 'duration_sec', bins=bin_edges)
plt.xscale('log')
plt.xticks(ticks, ticks)
plt.xlabel('Trip Duration (seconds)')
plt.ylabel('Frequency');
On a logarithmic scale, the distribution of trip durations looks roughly unimodal, with a prominent peak near 650 seconds. To make the findings easier to read, we'll add a column expressing trip duration in minutes, a unit that is generally more intuitive than seconds.
# create a duration column in minutes
df_clean['duration_min'] = df_clean['duration_sec'] / 60
Histogram of trip duration in minutes
bin_edges = np.arange(0, df_clean['duration_min'].max() + 5, 5)
plt.hist(data = df_clean, x = 'duration_min', bins = bin_edges)
plt.xlim(0, 100)
plt.xlabel('Trip Duration (minutes)')
plt.ylabel('Frequency');
Since the distribution is just as skewed when expressed in minutes, a logarithmic scale is again the most suitable option.
bin_edges = 10 ** np.arange(0, 3 + 0.1, 0.1)
ticks = [1, 3, 10, 30, 100, 300, 1000]
plt.hist(data = df_clean, x = 'duration_min', bins=bin_edges)
plt.xscale('log')
plt.xticks(ticks, ticks)
plt.xlabel('Trip Duration (minutes)')
plt.ylabel('Frequency');
When we look at the bike trip durations on a log scale in minutes, we see a peak around 9 to 10 minutes.
We'll take a look at the time-of-day feature by extracting the start hour from the start time.
# extract the start hour of the trip from the start time column
df_clean['start_hour'] = df_clean['start_time'].dt.strftime('%H')
df_clean['start_hour'] = df_clean['start_hour'].astype(int)
# test
df_clean.head()
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | start_hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984 | Male | No | 869.750000 | 17 |
1 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972 | Male | No | 1030.900000 | 12 |
2 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989 | Other | No | 608.166667 | 17 |
3 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974 | Male | Yes | 26.416667 | 23 |
4 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | -122.405950 | 5200 | Subscriber | 1959 | Male | No | 29.883333 | 23 |
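As an aside, the same extraction can be done in one step with the `.dt.hour` accessor, which yields integers directly and skips the `strftime`/`astype` round trip. A sketch on toy timestamps:

```python
import pandas as pd

# toy stand-in for the cleaned trip DataFrame
df_clean = pd.DataFrame({
    'start_time': pd.to_datetime([
        '2019-02-28 17:32:10', '2019-02-28 07:15:00', '2019-02-01 00:05:34',
    ]),
})

# .dt.hour returns the hour as an integer; no string round trip needed
df_clean['start_hour'] = df_clean['start_time'].dt.hour
print(df_clean['start_hour'].tolist())
```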
We will create a countplot showing the frequency of bike rides by hour of the day.
# plot the frequency of bike rides by hour of the day
plt.figure(figsize=[10,6])
base_color = sb.color_palette()[0]
sb.countplot(data = df_clean, x = 'start_hour', color = base_color)
plt.xlabel('Start Hour')
plt.title('Frequency of Bike Rides by Hour of the Day')
Text(0.5, 1.0, 'Frequency of Bike Rides by Hour of the Day')
The visualization demonstrates a significant increase in bike trips during the morning rush from 7 to 9 AM and in the evening from 4 to 7 PM.
To gain further insights, we'll extract the day of the week to see if these patterns align with weekday commuting behavior.
# extracting the day of the week from the start_time column
df_clean['day'] = df_clean['start_time'].dt.strftime('%A')
Let's check the dataframe for the changes
df_clean
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | start_hour | day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984 | Male | No | 869.750000 | 17 | Thursday |
1 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972 | Male | No | 1030.900000 | 12 | Thursday |
2 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989 | Other | No | 608.166667 | 17 | Thursday |
3 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974 | Male | Yes | 26.416667 | 23 | Thursday |
4 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | -122.405950 | 5200 | Subscriber | 1959 | Male | No | 29.883333 | 23 | Thursday |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173267 | 480 | 2019-02-01 00:04:49.724 | 2019-02-01 00:12:50.034 | 27.0 | Beale St at Harrison St | 37.788059 | -122.391865 | 324.0 | Union Square (Powell St at Post St) | 37.788300 | -122.408531 | 4832 | Subscriber | 1996 | Male | No | 8.000000 | 0 | Friday |
173268 | 313 | 2019-02-01 00:05:34.744 | 2019-02-01 00:10:48.502 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 66.0 | 3rd St at Townsend St | 37.778742 | -122.392741 | 4960 | Subscriber | 1984 | Male | No | 5.216667 | 0 | Friday |
173269 | 141 | 2019-02-01 00:06:05.549 | 2019-02-01 00:08:27.220 | 278.0 | The Alameda at Bush St | 37.331932 | -121.904888 | 277.0 | Morrison Ave at Julian St | 37.333658 | -121.908586 | 3824 | Subscriber | 1990 | Male | Yes | 2.350000 | 0 | Friday |
173270 | 139 | 2019-02-01 00:05:34.360 | 2019-02-01 00:07:54.287 | 220.0 | San Pablo Ave at MLK Jr Way | 37.811351 | -122.273422 | 216.0 | San Pablo Ave at 27th St | 37.817827 | -122.275698 | 5095 | Subscriber | 1988 | Male | No | 2.316667 | 0 | Friday |
173271 | 271 | 2019-02-01 00:00:20.636 | 2019-02-01 00:04:52.058 | 24.0 | Spear St at Folsom St | 37.789677 | -122.390428 | 37.0 | 2nd St at Folsom St | 37.785000 | -122.395936 | 1057 | Subscriber | 1989 | Male | No | 4.516667 | 0 | Friday |
173272 rows × 19 columns
Next, we'll convert the day-of-week column to an ordered categorical so that plots and groupings follow the calendar from Monday to Sunday.
# order the days of the week
df_clean['day'] = pd.Categorical(df_clean['day'], categories= ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'],ordered=True)
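To see what the ordered categorical buys us, here is a minimal sketch on toy data (the day names match the notebook; the sample values are made up):

```python
import pandas as pd

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Without an ordered categorical, string days sort alphabetically
s = pd.Series(['Sunday', 'Friday', 'Monday'])
print(sorted(s))  # ['Friday', 'Monday', 'Sunday']

# With an ordered categorical, sorting (and seaborn's axis order) follows the calendar
cat = pd.Categorical(s, categories=days, ordered=True)
print(pd.Series(cat).sort_values().tolist())  # ['Monday', 'Friday', 'Sunday']
```

Seaborn's `countplot` respects this category order, which is why the bars appear Monday through Sunday rather than alphabetically.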
This bar plot visualizes the frequency of bike rides for each day of the week
# plotting the frequency of bike rides by the day of the week
plt.figure(figsize=[10,6])
base_color = sb.color_palette()[0]
sb.countplot(data = df_clean, x = 'day', color = base_color)
plt.xlabel('Start Day')
plt.title('Frequency of Bike Rides by Day of the Week')
plt.xticks(rotation=15);
The graph clearly indicates that weekdays are the prime time for bike trips
The next focus will be on categorizing users to analyze whether subscribers tend to make more trips than one-time customers.
# frequency of bike rides by user type
base_color = sb.color_palette()[0]
user_order = df_clean['user_type'].value_counts().index
sb.countplot(data = df_clean, x = 'user_type', color = base_color, order = user_order)
plt.xlabel('User')
plt.title('Frequency of Bike Rides by User Type');
It turns out that program subscribers use bikes more frequently than occasional users. We’ll delve into whether one-time customers embark on longer bike rides than subscribers. This analysis will be featured in the next section through a bivariate exploration.
Next, we'll look at member age. We'll create an age column from the 'member_birth_year' column
#creating age column from member birth year (2019 - member birth year since this data is from 2019)
df_clean['age'] = (2019 - df_clean['member_birth_year'])
# checking this worked
df_clean
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | start_hour | day | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984 | Male | No | 869.750000 | 17 | Thursday | 35 |
1 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972 | Male | No | 1030.900000 | 12 | Thursday | 47 |
2 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989 | Other | No | 608.166667 | 17 | Thursday | 30 |
3 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974 | Male | Yes | 26.416667 | 23 | Thursday | 45 |
4 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | -122.405950 | 5200 | Subscriber | 1959 | Male | No | 29.883333 | 23 | Thursday | 60 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173267 | 480 | 2019-02-01 00:04:49.724 | 2019-02-01 00:12:50.034 | 27.0 | Beale St at Harrison St | 37.788059 | -122.391865 | 324.0 | Union Square (Powell St at Post St) | 37.788300 | -122.408531 | 4832 | Subscriber | 1996 | Male | No | 8.000000 | 0 | Friday | 23 |
173268 | 313 | 2019-02-01 00:05:34.744 | 2019-02-01 00:10:48.502 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 66.0 | 3rd St at Townsend St | 37.778742 | -122.392741 | 4960 | Subscriber | 1984 | Male | No | 5.216667 | 0 | Friday | 35 |
173269 | 141 | 2019-02-01 00:06:05.549 | 2019-02-01 00:08:27.220 | 278.0 | The Alameda at Bush St | 37.331932 | -121.904888 | 277.0 | Morrison Ave at Julian St | 37.333658 | -121.908586 | 3824 | Subscriber | 1990 | Male | Yes | 2.350000 | 0 | Friday | 29 |
173270 | 139 | 2019-02-01 00:05:34.360 | 2019-02-01 00:07:54.287 | 220.0 | San Pablo Ave at MLK Jr Way | 37.811351 | -122.273422 | 216.0 | San Pablo Ave at 27th St | 37.817827 | -122.275698 | 5095 | Subscriber | 1988 | Male | No | 2.316667 | 0 | Friday | 31 |
173271 | 271 | 2019-02-01 00:00:20.636 | 2019-02-01 00:04:52.058 | 24.0 | Spear St at Folsom St | 37.789677 | -122.390428 | 37.0 | 2nd St at Folsom St | 37.785000 | -122.395936 | 1057 | Subscriber | 1989 | Male | No | 4.516667 | 0 | Friday | 30 |
173272 rows × 20 columns
# plotting age distribution
bin_edges = np.arange(10, df_clean['age'].max()+2, 2)
plt.hist(data = df_clean, x = 'age', bins = bin_edges)
plt.xlim(10, 70)
plt.xlabel('Member Age (years)')
plt.ylabel('Frequency');
sb.violinplot(data = df_clean, y = 'age')
plt.ylabel('Member Age (years)');
The graphs above indicate a noticeable peak around the age of 30.
Next, we’ll analyze gender to determine if one demographic uses FordGo Bikes more than the others.
# plotting gender
base_color = sb.color_palette()[0]
gender_order = df_clean['member_gender'].value_counts().index
sb.countplot(data = df_clean, x = 'member_gender', color = base_color, order = gender_order)
plt.xlabel('Gender')
plt.title('Frequency of Bike Rides by Gender');
The data shows that a significant number of bike riders are male. We will later examine how gender impacts the length of bike trips.
We will now focus on examining the number of bike trips that were part of the Bike Share for All program.
base_color = sb.color_palette()[0]
share_order = df_clean['bike_share_for_all_trip'].value_counts().index
sb.countplot(data = df_clean, x = 'bike_share_for_all_trip', color = base_color, order = share_order)
plt.xlabel('Bike Share for All')
plt.title('Frequency of Bike Share for All Trips');
The data above indicates that most trips were not part of the Bike Share for All program.
In the final part of the univariate exploration, we will calculate the distance using the provided start and end longitude and latitude coordinates.
def haversine_np(start_station_longitude, start_station_latitude, end_station_longitude, end_station_latitude):
"""
Calculate the great circle distance between two points
on the earth (specified in decimal degrees).
All args must be of equal length.
"""
# Convert decimal degrees to radians
start_station_longitude, start_station_latitude, end_station_longitude, end_station_latitude = map(np.radians,
[start_station_longitude, start_station_latitude, end_station_longitude, end_station_latitude])
# Differences in coordinates
dlon = end_station_longitude - start_station_longitude
dlat = end_station_latitude - start_station_latitude
# Haversine formula
a = np.sin(dlat/2.0)**2 + np.cos(start_station_latitude) * np.cos(end_station_latitude) * np.sin(dlon/2.0)**2
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)) # Using arctan2 for stability
# Radius of the Earth in kilometers
radius_earth_km = 6371.0
km = radius_earth_km * c
return km
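As a quick sanity check on the formula, we can feed it the coordinates of the first trip in the dataset (Montgomery St BART Station to Commercial St at Montgomery St), which should come out at roughly half a kilometre. The function is restated compactly here so the snippet runs on its own:

```python
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2):
    """Great-circle distance in km between points given in decimal degrees."""
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    return 6371.0 * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Coordinates taken from the first row of the dataset
d = haversine_np(-122.400811, 37.789625, -122.402923, 37.794231)
print(d)  # roughly 0.54 km -- a couple of city blocks
```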
We'll create a distance column
# creating a distance column
df_clean['distance'] = haversine_np(df_clean['start_station_longitude'],df_clean['start_station_latitude'],df_clean['end_station_longitude'],df_clean['end_station_latitude'])
# checking that this worked
df_clean
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | start_hour | day | age | distance | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | 4902 | Customer | 1984 | Male | No | 869.750000 | 17 | Thursday | 35 | 0.544709 |
1 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | ... | 5905 | Customer | 1972 | Male | No | 1030.900000 | 12 | Thursday | 47 | 2.704545 |
2 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | ... | 6638 | Subscriber | 1989 | Other | No | 608.166667 | 17 | Thursday | 30 | 0.260739 |
3 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | ... | 4898 | Subscriber | 1974 | Male | Yes | 26.416667 | 23 | Thursday | 45 | 2.409301 |
4 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | ... | 5200 | Subscriber | 1959 | Male | No | 29.883333 | 23 | Thursday | 60 | 3.332203 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173267 | 480 | 2019-02-01 00:04:49.724 | 2019-02-01 00:12:50.034 | 27.0 | Beale St at Harrison St | 37.788059 | -122.391865 | 324.0 | Union Square (Powell St at Post St) | 37.788300 | ... | 4832 | Subscriber | 1996 | Male | No | 8.000000 | 0 | Friday | 23 | 1.464766 |
173268 | 313 | 2019-02-01 00:05:34.744 | 2019-02-01 00:10:48.502 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 66.0 | 3rd St at Townsend St | 37.778742 | ... | 4960 | Subscriber | 1984 | Male | No | 5.216667 | 0 | Friday | 35 | 1.402716 |
173269 | 141 | 2019-02-01 00:06:05.549 | 2019-02-01 00:08:27.220 | 278.0 | The Alameda at Bush St | 37.331932 | -121.904888 | 277.0 | Morrison Ave at Julian St | 37.333658 | ... | 3824 | Subscriber | 1990 | Male | Yes | 2.350000 | 0 | Friday | 29 | 0.379066 |
173270 | 139 | 2019-02-01 00:05:34.360 | 2019-02-01 00:07:54.287 | 220.0 | San Pablo Ave at MLK Jr Way | 37.811351 | -122.273422 | 216.0 | San Pablo Ave at 27th St | 37.817827 | ... | 5095 | Subscriber | 1988 | Male | No | 2.316667 | 0 | Friday | 31 | 0.747282 |
173271 | 271 | 2019-02-01 00:00:20.636 | 2019-02-01 00:04:52.058 | 24.0 | Spear St at Folsom St | 37.789677 | -122.390428 | 37.0 | 2nd St at Folsom St | 37.785000 | ... | 1057 | Subscriber | 1989 | Male | No | 4.516667 | 0 | Friday | 30 | 0.710395 |
173272 rows × 21 columns
Now let's check for outliers
df_clean.distance.describe()
count    173272.000000
mean          1.691558
std           1.096204
min           0.000000
25%           0.910955
50%           1.430675
75%           2.225687
max          69.469241
Name: distance, dtype: float64
df_clean.query("distance > 20")
duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | start_hour | day | age | distance | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
105822 | 6945 | 2019-02-12 14:28:44.402 | 2019-02-12 16:24:30.158 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 300.0 | Palm St at Willow St | 37.317298 | ... | 4780 | Subscriber | 1985 | Female | No | 115.75 | 14 | Tuesday | 34 | 69.469241 |
1 rows × 21 columns
In this section, we'll remove this outlier
df_clean = df_clean.drop(df_clean[df_clean.distance > 20].index)
df_clean.distance.describe()
count    173271.000000
mean          1.691167
std           1.084047
min           0.000000
25%           0.910955
50%           1.430675
75%           2.225687
max          15.673955
Name: distance, dtype: float64
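The drop-by-index pattern used above is interchangeable with a boolean mask; a minimal sketch on toy data:

```python
import pandas as pd

df = pd.DataFrame({'distance': [0.5, 1.2, 69.5, 2.0]})

dropped = df.drop(df[df.distance > 20].index)  # pattern used in the notebook
masked = df[df.distance <= 20]                 # equivalent boolean mask

print(dropped.equals(masked))  # True
```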
Let's plot the distance distribution
# plotting distance distribution
bin_edges = np.arange(0, df_clean['distance'].max()+0.5, 0.5)
plt.hist(data = df_clean, x = 'distance', bins = bin_edges)
plt.xlabel('Distance (km)')
plt.xlim(0,10)
plt.ylabel('Frequency');
Here, we see the frequency distribution of distances traveled, with most trips covering between 0 and 2 kilometers, and the frequency significantly decreasing for longer distances.
Discuss the distribution(s) of your variable(s) of interest. Were there any unusual points? Did you need to perform any transformations?¶
The duration of bike trips was highly skewed to the right, so a log scale was applied, under which the distribution appears roughly log-normal. To enhance comprehension, the duration of bike trips in minutes was added to the dataset. The resulting graph clearly indicates that most bike rides are under 30 minutes, with an average duration of around 9-10 minutes.
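The log transform described here can be sketched on synthetic right-skewed data (the notebook applied it to the real duration column; the values below are simulated):

```python
import numpy as np
import matplotlib
matplotlib.use('Agg')  # non-interactive backend for this sketch
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
# Simulated trip durations in minutes: right-skewed, median around 9
durations = rng.lognormal(mean=np.log(9), sigma=0.7, size=10_000)

# Log-spaced bin edges so every bar covers an equal ratio rather than an equal width
bin_edges = 10 ** np.arange(np.log10(durations.min()),
                            np.log10(durations.max()) + 0.05, 0.05)
plt.hist(durations, bins=bin_edges)
plt.xscale('log')
plt.xlabel('Duration (min)')
plt.ylabel('Frequency')
```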
Additionally, a distance column was created using longitude and latitude data, revealing that the majority of bike trips were under 2 km.
Of the features you investigated, were there any unusual distributions? Did you perform any operations on the data to tidy, adjust, or change the form of the data? If so, why did you do this?¶
The time of day and day of the week were examined, indicating that bikes were primarily used for commuting, with peak usage occurring from 07:00-09:00 and 16:00-19:00 on weekdays.
An age column was included to assess the ages of members. The analysis showed a peak of riders around 30 years old. To tidy the data, individuals over the age of 65 were removed, representing the top 1% of the dataset.
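The top-1% trim described above can be expressed with a quantile cutoff; a sketch on simulated ages (in the notebook the 65-year threshold was read off the real data):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
ages = pd.Series(rng.normal(35, 10, size=5000).round().clip(18, 90))

cutoff = ages.quantile(0.99)   # age at the 99th percentile
trimmed = ages[ages <= cutoff]

print(len(trimmed) / len(ages))  # roughly 0.99
```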
Moreover, it was observed that there were more subscribers than customers taking bike trips, and a higher number of male riders compared to female or other categories. Most trips were not part of the bike share for all trip scheme.
Bivariate Exploration¶
We will now embark on a bivariate analysis to examine potential relationships within the data.
df_clean.info()
<class 'pandas.core.frame.DataFrame'>
Index: 173271 entries, 0 to 173271
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   duration_sec             173271 non-null  int64
 1   start_time               173271 non-null  datetime64[ns]
 2   end_time                 173271 non-null  datetime64[ns]
 3   start_station_id         173271 non-null  object
 4   start_station_name       173271 non-null  object
 5   start_station_latitude   173271 non-null  float64
 6   start_station_longitude  173271 non-null  float64
 7   end_station_id           173271 non-null  object
 8   end_station_name         173271 non-null  object
 9   end_station_latitude     173271 non-null  float64
 10  end_station_longitude    173271 non-null  float64
 11  bike_id                  173271 non-null  object
 12  user_type                173271 non-null  category
 13  member_birth_year        173271 non-null  int64
 14  member_gender            173271 non-null  category
 15  bike_share_for_all_trip  173271 non-null  category
 16  duration_min             173271 non-null  float64
 17  start_hour               173271 non-null  int64
 18  day                      173271 non-null  category
 19  age                      173271 non-null  int64
 20  distance                 173271 non-null  float64
dtypes: category(4), datetime64[ns](2), float64(6), int64(4), object(5)
memory usage: 24.5+ MB
We'll designate numeric and categorical variables to simplify plotting.
# assigning numeric and categoric variables
numeric_vars = ['duration_min', 'age', 'distance']
categoric_vars = ['member_gender', 'user_type', 'bike_share_for_all_trip', 'day', 'start_hour']
To start, we will analyze the numeric variables to uncover any existing relationships or correlations.
g = sb.PairGrid(data = df_clean, vars = numeric_vars)
g.map_diag(plt.hist, bins = 20)
g.map_offdiag(plt.scatter, alpha = 0.3)
<seaborn.axisgrid.PairGrid at 0x15bbf9dd0>
The plots indicate that there are no strong correlations among the numeric variables. We will investigate the relationships between the numeric and categorical variables, as well as those between different categorical variables.
Duration Min¶
We will look at the relationship between the duration of bike trips (measured in minutes) among other features
def boxgrid(x, y, **kwargs):
default_color = sb.color_palette()[0]
# Call boxplot with explicit parameters
sb.boxplot(x=x, y=y, color=default_color, **{k: v for k, v in kwargs.items() if k != 'color'})
g = sb.PairGrid(data=df_clean, y_vars='duration_min', x_vars=['member_gender', 'user_type', 'bike_share_for_all_trip'],
                height=3, aspect=1.5)
g.map(boxgrid)
# Apply the y-limit to the grid's axes (a bare plt.ylim before mapping targets a new, empty figure)
g.set(ylim=(0, 50))
plt.show()
The plot clearly shows that, on average, females and individuals identifying as "other" take longer bike trips than males. Interestingly, customers tend to take longer trips than subscribers, suggesting that when customers opt for a one-time ride, they ride for longer. Additionally, the bike share program does not significantly influence trip duration, so it will not be considered in the remainder of the analysis.
Next, we will explore the features associated with the time of day and day of the week
def boxgrid(x, y, **kwargs):
default_color = sb.color_palette()[0]
# Call boxplot with explicit parameters
sb.boxplot(x=x, y=y, color=default_color, **{k: v for k, v in kwargs.items() if k != 'color'})
g = sb.PairGrid(data=df_clean, y_vars='duration_min', x_vars=['day', 'start_hour'],
                height=3, aspect=1.5)
g.map(boxgrid)
# Apply the y-limit to the grid's axes (a bare plt.ylim before mapping targets a new, empty figure)
g.set(ylim=(0, 50))
# Rotate x-axis labels properly
for ax in g.axes.flatten():
# Get current tick positions
ticks = ax.get_xticks()
ax.set_xticks(ticks) # Ensure ticks are set
ax.set_xticklabels(ax.get_xticklabels(), rotation=25, ha='right')
plt.show();
The day of the week influenced trip duration, with longer bike rides occurring on weekends. Additionally, the most extended bike trips happened in the afternoon, specifically between 14:00 and 15:00.
Distance¶
We will take a brief look at the other features and how they relate to each other.
def boxgrid(x, y, **kwargs):
default_color = sb.color_palette()[0]
# Call boxplot with explicit parameters
sb.boxplot(x=x, y=y, color=default_color, **{k: v for k, v in kwargs.items() if k != 'color'})
g = sb.PairGrid(data=df_clean, y_vars='distance', x_vars=['member_gender', 'user_type', 'bike_share_for_all_trip'],
                height=3, aspect=1.5)
g.map(boxgrid)
g.set(ylim=(0, 5))
plt.show()
The analysis above reveals that some variables had a small effect on distance traveled. Notably, females and those who chose 'other' as their gender traveled farther, as did customers compared to subscribers. Furthermore, individuals not engaged in the bike share program also tended to cover longer distances.
We'll take a brief look at distance along with the other variables.
def boxgrid(x, y, **kwargs):
default_color = sb.color_palette()[0]
sb.boxplot(x=x, y=y, color=default_color, **{k: v for k, v in kwargs.items() if k != 'color'})
g = sb.PairGrid(data=df_clean, y_vars='distance', x_vars=['day', 'start_hour'], height=3, aspect=1.5)
g.map(boxgrid)
g.set(ylim=(0, 5))
for ax in g.axes.flat:
# Get the current x-ticks
ticks = ax.get_xticks()
# Set the new labels for the current ticks
ax.set_xticks(ticks) # Ensure ticks are set before labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=25, ha='right')
plt.show();
The analysis above shows that the day of the week does not really affect the distance traveled. However, it is evident that longer distances were covered in the mornings, particularly between 7 AM and 8 AM. Since there are no compelling relationships regarding distance to explore further, we will now turn our attention to age and its relation to other variables.
Age¶
def boxgrid(x, y, **kwargs):
default_color = sb.color_palette()[0]
# Call boxplot with explicit parameters
sb.boxplot(x=x, y=y, color=default_color, **{k: v for k, v in kwargs.items() if k != 'color'})
g = sb.PairGrid(data=df_clean, y_vars='age', x_vars=['member_gender', 'user_type', 'bike_share_for_all_trip'],
                height=3, aspect=1.5)
g.map(boxgrid)
plt.show()
The boxplot above indicates that the majority of members are in their early 30s, regardless of gender or user type. Participants in the bike share for all scheme tend to be younger on average than those who are not involved.
def boxgrid(x, y, **kwargs):
default_color = sb.color_palette()[0]
# Call boxplot with explicit parameters
sb.boxplot(x=x, y=y, color=default_color, **{k: v for k, v in kwargs.items() if k != 'color'})
plt.xticks(rotation=25)
g = sb.PairGrid(data=df_clean, y_vars='age', x_vars=['day', 'start_hour'],
                height=3, aspect=1.5)
g.map(boxgrid)
plt.show()
The plot above shows that younger individuals are more likely to take bike trips on weekends. On average, these trips occur late at night and in the early morning hours. We will exclude 'age,' 'distance,' and 'bike share for all' from further analysis, as no significant relationships were identified with these variables.
User Type and Gender¶
Let's examine the relationship between user type and gender.
#clustered bar chart
ax = sb.countplot(data = df_clean, x = 'user_type', hue = 'member_gender', hue_order=['Male', 'Female', 'Other'])
ax.legend(loc = 2, framealpha = 1)
<matplotlib.legend.Legend at 0x15ce5c150>
This clustered bar chart clearly shows that subscribers took more bike rides than customers. The majority of both groups were male.
Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?¶
During this part of the investigation, it was found that, on average, individuals who identified as female or as "other" took longer bike rides than males. Customers also generally took longer rides than subscribers, as did those not participating in the bike share for all scheme. The day of the week affected trip duration, with longer rides occurring on weekends, and the longest bike trips started between 2-3 PM.
Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?¶
In terms of the day of the week, younger individuals tended to take more bike trips on weekends. Additionally, trips made very late at night and during the early morning hours were predominantly by younger people.
From the demographic analysis, it is clear that subscribers took more bike rides than customers, and the majority of both groups were male.
When examining other features in the dataset, it appears that those who identified as female or other traveled slightly greater distances than males. Customers also traveled slightly further than subscribers on average, and those not involved in the bike share for all scheme tended to cover greater distances as well. Most members are in their early 30s, and those participating in the bike share for all scheme are generally younger than those who are not.
Moving forward, 'age', 'distance', and 'bike share for all' will not be included in any further analysis, as they do not relate to the features of interest and do not demonstrate strong relationships with other features in the dataset.
Multivariate Exploration¶
Next, we'll utilize multivariate plots to examine how the duration and frequency of bike trips relate to various categorical variables, including gender, user type, time of day, and day of the week.
default_blue = sb.color_palette()[0]
g = sb.PairGrid(data=df_clean, y_vars='duration_min', x_vars=['member_gender', 'user_type', 'day'])
g.map(sb.violinplot, inner='quartile', color=default_blue)
# Set the y-limit after mapping so it applies to the grid's axes
g.set(ylim=(0, 50))
for ax in g.fig.axes:
    ax.tick_params(axis='x', rotation=30)
plt.show()
default_blue = sb.color_palette()[0]
g = sb.PairGrid(data=df_clean, y_vars='duration_min', x_vars=['member_gender', 'user_type', 'day'], height=4)
g.map(sb.boxplot, color=default_blue)
g.set(ylim=(0, 50))
for ax in g.fig.axes:
    ax.tick_params(axis='x', rotation=30)
plt.show()
Lets create a clustered bar chart illustrating the relationship between duration, user type, and gender
# clustered bar chart
default_colors = ['#1f77b4', '#ff7f0e', '#2ca02c'] # Blue, orange, green
user_type_order = ['Subscriber', 'Customer']
plt.figure(figsize=[10, 6])
ax = sb.barplot(data=df_clean, x='user_type', y='duration_min', hue='member_gender', hue_order=['Male', 'Female', 'Other'], palette=default_colors, order=user_type_order)
ax.legend(loc=2, ncol=1, framealpha=1, title='member_gender')
plt.show()
The data indicates that females and others took longer bike trips than males, regardless of their status as customers or subscribers.
A clustered bar chart will be created to explore the relationship between duration, day of the week, and gender.
# clustered bar chart
plt.figure(figsize = [12, 8])
ax = sb.barplot(data = df_clean, x = 'day', y = 'duration_min', hue = 'member_gender', hue_order=['Male', 'Female', 'Other'], palette=default_colors)
ax.legend(loc = 2, ncol = 2, framealpha = 1, title = 'member_gender')
<matplotlib.legend.Legend at 0x15cf82690>
As shown above, females and individuals identifying as other took longer bike rides than males on each day of the week, and all genders had longer bike trips on weekends.
We will explore the relationship between duration, user type, and the day of the week.
# clustered bar chart
plt.figure(figsize = [12, 8])
ax = sb.barplot(data = df_clean, x = 'day', y = 'duration_min', hue = 'user_type', palette=default_colors[:2])
ax.legend(loc = 2, ncol = 2, framealpha = 1, title = 'user type')
<matplotlib.legend.Legend at 0x159917610>
Facet Plot
Let's try a Facet Plot for more clarity
# Define the order for the days
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Convert the 'day' column to a categorical type with the specified order
df_clean['day'] = pd.Categorical(df_clean['day'], categories=day_order, ordered=True)
# Create a FacetGrid
g = sb.FacetGrid(df_clean, col='user_type', col_wrap=2, height=4, aspect=1.5)
# Create the bar plot within each facet
g.map_dataframe(sb.barplot, x='day', y='duration_min', hue='user_type', palette=default_colors[:2], order=day_order)
# Add legends and titles
g.add_legend(title='User Type', bbox_to_anchor=(1.05, 1), loc='upper left')
g.set_axis_labels('Day', 'Duration (minutes)')
g.set_titles(col_template='{col_name}')
# Show the plot
plt.tight_layout()
plt.show()
As seen above, customers had longer bike rides than subscribers on every day of the week, and both groups took longer trips on the weekends.
Frequency of bike trips based on the day of the week and time of day¶
Next, a heatmap will be created to visualize the frequency of bike trips based on the day of the week and time of day.
# Generate a second dataframe for visualization
df_clean2 = pd.pivot_table(df_clean[['day', 'start_hour', 'duration_min']], index=['day', 'start_hour'], aggfunc='count')
# Unstacking to achieve the appropriate format for the heatmap.
df_clean3 = df_clean2.unstack(level=0)
# Generate new labels for the hours.
am_hrs = [f"{hr}am" for hr in range(1, 12)]
pm_hrs = [f"{hr}pm" for hr in range(1, 12)]
complete_hrs = ["12am"] + am_hrs + ["12pm"] + pm_hrs
# Abbreviated names for the days of the week
day_abbr = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
# Create the heatmap
sb.set_context("talk")
f, ax = plt.subplots(figsize=(11, 15))
ax = sb.heatmap(df_clean3, annot=True, fmt="d", linewidths=.5, ax=ax, xticklabels=day_abbr, yticklabels=complete_hrs, cmap="viridis")
ax.axes.set_title("Heatmap of Ride Counts by Day and Hour of Day", fontsize=24, y=1.01)
ax.set(xlabel='Day of Week', ylabel='Starting Hour of Ride');
plt.show()
The heatmap above reveals that on weekdays, the majority of bike trips occur between 6 AM and 9 AM, as well as 4 PM to 7 PM.
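The pivot_table/unstack step that feeds this heatmap can be sketched on toy data (column names match the notebook; the rows are invented):

```python
import pandas as pd

toy = pd.DataFrame({
    'day':          ['Mon', 'Mon', 'Mon', 'Tue'],
    'start_hour':   [8, 8, 17, 8],
    'duration_min': [5.0, 7.0, 6.0, 4.0],
})

# Count trips per (day, start_hour) pair, then unstack days into columns
counts = pd.pivot_table(toy, index=['day', 'start_hour'],
                        values='duration_min', aggfunc='count')
grid = counts.unstack(level=0)

print(grid)
# Mon 8am -> 2 trips, Mon 5pm -> 1, Tue 8am -> 1; missing combinations become NaN
```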
Time of day, Day of the week, and Duration (minutes)¶
Now let's visualize the relationship between time of day, day of the week, and bike ride duration.
plt.figure(figsize=[12, 12])
cat_means = df_clean.groupby(['day', 'start_hour'], observed=False).mean(numeric_only=True)['duration_min']
cat_means = cat_means.reset_index(name='duration_min_avg')
cat_means = cat_means.pivot(index='start_hour', columns='day', values='duration_min_avg')
sb.heatmap(cat_means, annot=True, fmt='.3f',
cbar_kws={'label': 'Average Duration of Bike Trip (minutes)'}, xticklabels=day_abbr, yticklabels=complete_hrs, cmap="viridis_r")
plt.xlabel("Day of the Week")
plt.ylabel("Starting Hour of the Bike Ride")
plt.title("Heatmap of Ride Duration by Day and Hour of Day", fontsize=24, y=1.01);
The heatmap above shows that bike rides tend to be a bit longer on weekends compared to weekdays. It also reveals that the longest trips, on average, occur during the early morning hours.
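The aggregation behind this second heatmap (group, average, then pivot) works the same way on toy data:

```python
import pandas as pd

toy = pd.DataFrame({
    'day':          ['Mon', 'Mon', 'Sat', 'Sat'],
    'start_hour':   [8, 8, 14, 14],
    'duration_min': [5.0, 7.0, 20.0, 30.0],
})

# Average duration per (day, start_hour), then pivot days into columns
means = toy.groupby(['day', 'start_hour'])['duration_min'].mean()
means = means.reset_index(name='duration_min_avg')
grid = means.pivot(index='start_hour', columns='day', values='duration_min_avg')

print(grid)
# Mon 8am averages 6.0 min; Sat 2pm averages 25.0 min; other cells are NaN
```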
Talk about some of the relationships you observed in this part of the investigation. Were there features that strengthened each other in terms of looking at your feature(s) of interest?¶
The investigation into bike trip length and frequency revealed notable relationships, particularly regarding the impact of the day of the week and time of day. The analysis showed that both factors influenced bike trip frequency, with most trips occurring on weekdays during commuting hours, as people utilized bikes for their daily commutes. It was evident that these features supported each other in highlighting patterns of bike usage.
Were there any interesting or surprising interactions between features?¶
While it was expected that more bike trips would occur during commuting hours, it was surprising to find that longer bike trips on average were more common during weekends compared to weekdays. Additionally, an unexpected finding was that some of the longest bike trips during weekdays took place between 1 AM and 3 AM. The data also revealed interesting trends related to gender. All genders tended to take longer bike rides on weekends, but those identifying as 'female' or 'other' consistently took longer trips throughout the week. Furthermore, customers generally took longer trips than subscribers on all days.
Conclusions¶
During the exploration, we saw that most bike trips lasted under 30 minutes, averaging around 9 minutes, with most trips under 2 km. Users primarily rode during weekdays and commuting hours. Most users were subscribers, but customers tended to take longer trips. The service was most popular among those in their mid-twenties to mid-thirties, with usage declining with age. Males used the service more, but females and those identifying as 'other' had longer trips. Most trips were not part of the Bike Share for All program. Bivariate analysis showed weak correlations among age, duration, and distance. Longer trips occurred on weekends and in the afternoon. The exploration confirmed that trips were most common during commuting hours, with unexpectedly long trips occurring late at night. Interestingly, females and 'other' users traveled greater distances, as did customers compared to subscribers. Mornings, especially 7-8 AM, saw longer distances traveled. Most users were in their early 30s, with those in the Bike Share for All program being younger. Younger riders were more active on weekends and took late-night trips.
# save as a csv
df_clean.to_csv('2019-fordgobike-data-clean.csv', index = False)