MLOps Zoomcamp 2025: Module 1 Homework

Author

Tony Wu

1 Setup

import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read the 2023-01 and 2023-02 yellow_tripdata parquet files directly from
# their URLs into DataFrames.
JAN_2023_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
FEB_2023_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"
yellow_01_23 = pd.read_parquet(JAN_2023_URL)
yellow_02_23 = pd.read_parquet(FEB_2023_URL)

2 Problem 1

# A DataFrame's shape is (rows, columns); index 1 is the column count.
n_columns = yellow_01_23.shape[1]
print(f"Number of columns: {n_columns}")
Number of columns: 19

3 Problem 2

# Trip duration is the drop-off timestamp minus the pick-up timestamp.
pickup_time = yellow_01_23["tpep_pickup_datetime"]
dropoff_time = yellow_01_23["tpep_dropoff_datetime"]
duration = dropoff_time - pickup_time

# Express every duration in minutes as a float (seconds divided by 60).
duration_min = duration.dt.total_seconds().div(60).astype(float)

duration_std = duration_min.std()
print(f"Standard deviation of trip duration (minutes): {duration_std}")
Standard deviation of trip duration (minutes): 42.59435124195458

4 Problem 3

# Compare the total number of trips with the number whose duration lies
# between 1 and 60 minutes, both bounds included.
initial_recs = duration.shape[0]
valid_duration = duration_min.between(1, 60)
final_recs = int(valid_duration.sum())
print(f"Fraction of valid records: {final_recs / initial_recs:.2f}")
Fraction of valid records: 0.98

5 Problem 4

# Build the training frame: attach the trip duration (in minutes) to a copy of
# the 2023-01 data and keep only trips lasting 1 to 60 minutes inclusive.
df_train = yellow_01_23.assign(duration=duration_min)
df_train = df_train.loc[df_train["duration"].between(1, 60)]

# Cast the pickup/drop-off location IDs to strings so that DictVectorizer
# one-hot encodes them instead of treating them as numeric features.
categorical = ['PULocationID', 'DOLocationID']
df_train = df_train.astype({column: str for column in categorical})
train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the vectorizer on the training records and build the feature matrix.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)
n_features = x_train.shape[1]
print(f"Number of columns: {n_features}")
Number of columns: 515

6 Problem 5

# Fit a plain linear regression that predicts trip duration (minutes) from
# the one-hot encoded pickup/drop-off location features.
y_train = df_train['duration'].values
lr = LinearRegression()
lr.fit(x_train, y_train)

# Evaluate the model on the same data it was trained on.
y_pred_train = lr.predict(x_train)

# mean_squared_error returns the MSE (squared minutes). Keep it under an
# accurate name and store the square root separately, so that `train_rmse`
# really holds the RMSE in minutes instead of the squared value.
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = train_mse ** 0.5
print(f"Training RMSE: {train_rmse:.2f}")
Training RMSE: 7.65

7 Problem 6

# Create the validation dataset from the 2023-02 data, applying the same
# steps as for the training set: duration in minutes, keep trips lasting
# 1 to 60 minutes inclusive, and cast the location IDs to strings.
df_val = yellow_02_23.copy()
duration_val = df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
duration_val_min = (duration_val.dt.total_seconds() / 60).astype(float)
df_val["duration"] = duration_val_min
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]
df_val[categorical] = df_val[categorical].astype(str)

# Feature matrix using the encodings learned from the training set: the
# already-fitted vectorizer is only applied here (transform, not fit), so the
# validation matrix has the same columns as the training matrix.
val_dicts = df_val[categorical].to_dict(orient='records')
x_val = dv.transform(val_dicts)
y_val = df_val['duration'].values

# Predict and evaluate the model on the validation data.
y_pred_val = lr.predict(x_val)

# mean_squared_error returns the MSE (squared minutes). Keep it under an
# accurate name and store the square root separately, so that `val_rmse`
# really holds the RMSE in minutes instead of the squared value.
val_mse = mean_squared_error(y_val, y_pred_val)
val_rmse = val_mse ** 0.5
print(f"Validation RMSE: {val_rmse:.2f}")
Validation RMSE: 7.81