import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Download the datasets: the 2023-01 and 2023-02 yellow trip-data Parquet files.
# The 2023-01 frame is used for training and the 2023-02 frame for validation.
yellow_01_23 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
yellow_02_23 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")
# MLOps Zoomcamp 2025: Module 1 Homework
# 1 Setup
# 2 Problem 1
# Problem 1: report how many columns the 2023-01 dataframe has.
print(f"Number of columns: {len(yellow_01_23.columns)}")
# Output: Number of columns: 19
# 3 Problem 2
# Trip duration is drop-off time minus pick-up time; the resulting timedelta
# series is converted to minutes as floats.
duration = yellow_01_23.tpep_dropoff_datetime - yellow_01_23.tpep_pickup_datetime
duration_min = (duration.dt.total_seconds() / 60).astype(float)
print(f"Standard deviation of trip duration (minutes): {duration_min.std()}")
# Output: Standard deviation of trip duration (minutes): 42.59435124195458
# 4 Problem 3
# Compare the number of records before and after keeping only trips whose
# duration lies between 1 and 60 minutes (both bounds inclusive).
initial_recs = len(duration)
final_recs = len(duration_min[(duration_min >= 1) & (duration_min <= 60)])
print(f"Fraction of valid records: {final_recs / initial_recs:.2f}")
# Output: Fraction of valid records: 0.98
# 5 Problem 4
# Create the duration column (in minutes) on a copy of the 2023-01 data and
# keep only trips lasting between 1 and 60 minutes (inclusive).
df_train = yellow_01_23.copy()
df_train["duration"] = duration_min
# The explicit .copy() makes the filtered frame independent of the unfiltered
# one, so later column assignments do not raise SettingWithCopyWarning.
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)].copy()
# Convert the categorical ID columns to strings, then turn every row into a
# {column: value} dict, the input format used for vectorization.
categorical = ['PULocationID', 'DOLocationID']
df_train[categorical] = df_train[categorical].astype(str)
train_dicts = df_train[categorical].to_dict(orient='records')
# Get feature matrix: fit the vectorizer on the training dicts and encode them
# in one step; the fitted `dv` is kept for encoding other data the same way.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)
print(f"Number of columns: {x_train.shape[1]}")
# Output: Number of columns: 515
# 6 Problem 5
# Train the LR model: the target is the trip duration in minutes, the features
# are the vectorized pick-up / drop-off location IDs.
y_train = df_train['duration'].values
lr = LinearRegression()
lr.fit(x_train, y_train)
# Evaluate the model on the training data.
y_pred_train = lr.predict(x_train)
# NOTE: despite its name, `train_rmse` holds the mean squared error; the
# square root is taken only when the value is formatted for printing.
train_rmse = mean_squared_error(y_train, y_pred_train)
print(f"Training RMSE: {train_rmse**0.5:.2f}")
# Output: Training RMSE: 7.65
# 7 Problem 6
# Create the validation dataset from the 2023-02 data, applying the same steps
# as for training: duration in minutes, 1-60 minute filter, string categoricals.
df_val = yellow_02_23.copy()
duration_val = df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
duration_val_min = (duration_val.dt.total_seconds() / 60).astype(float)
df_val["duration"] = duration_val_min
# The explicit .copy() makes the filtered frame independent of the unfiltered
# one, so the column assignment below does not raise SettingWithCopyWarning.
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()
df_val[categorical] = df_val[categorical].astype(str)
# Feature matrix using encodings from our training set: `dv` is only
# transformed here, not re-fitted, so x_val has the same columns as x_train.
val_dicts = df_val[categorical].to_dict(orient='records')
x_val = dv.transform(val_dicts)
y_val = df_val['duration'].values
# Predict and evaluate the model on the validation data.
y_pred_val = lr.predict(x_val)
# NOTE: despite its name, `val_rmse` holds the mean squared error; the square
# root is taken only when the value is formatted for printing.
val_rmse = mean_squared_error(y_val, y_pred_val)
print(f"Validation RMSE: {val_rmse**0.5:.2f}")
# Output: Validation RMSE: 7.81