LLM Zoomcamp 2025: dlt Workshop

Author

Tony Wu

1 Problem 1

import dlt
# Print the version string exposed by the imported dlt package.
print(f"Version: {dlt.__version__}")
Version: 1.12.3

2 Problem 2

import dlt
import requests
from dlt.destinations import qdrant

@dlt.resource
def zoomcamp_data():
    """Yield the workshop documents, each tagged with its course name.

    Downloads ``documents.json`` from the llm-rag-workshop repository. The
    payload is iterated as a list of course entries, each providing a
    ``course`` name and a ``documents`` list. Every document is yielded as
    its own record with the parent course name stored under ``course``.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    # requests applies no timeout by default; without one a stalled
    # connection would block the pipeline indefinitely.
    docs_response = requests.get(docs_url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    for course in documents_raw:
        course_name = course['course']

        for doc in course['documents']:
            # Flatten the nesting: carry the course name on every document.
            doc['course'] = course_name
            yield doc

# Qdrant destination pointed at the "db.qdrant" path.
qdrant_destination = qdrant(
    qd_path="db.qdrant",
)

pipeline = dlt.pipeline(
    pipeline_name="zoomcamp_pipeline",
    destination=qdrant_destination,
    dataset_name="zoomcamp_tagged_data",
)
load_info = pipeline.run(zoomcamp_data())

# Report the row count recorded by the run instead of a hard-coded figure.
# row_counts maps table name -> number of rows and may also list dlt's own
# bookkeeping tables, so select the table produced by the resource by name.
row_counts = pipeline.last_trace.last_normalize_info.row_counts
print(f"Rows inserted: {row_counts.get('zoomcamp_data', 0)}")
Rows inserted: 948

3 Problem 3

import json

# Relative path so the lookup targets the same "db.qdrant" directory that the
# pipeline's qd_path points at, rather than one user's home directory.
file_path = "db.qdrant/meta.json"

with open(file_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

# Collect all unique embedding model names: the keys of every collection's
# "vectors" mapping.
embedding_models = set()

for collection in data.get("collections", {}).values():
    embedding_models.update(collection.get("vectors", {}))

print("Embedding model(s) used:")
# Sets are unordered; sort so the listing is stable from run to run.
for model in sorted(embedding_models):
    print(model)
Embedding model(s) used:
fast-bge-small-en