import dlt
print(f"Version: {dlt.__version__}")
Version: 1.12.3
import dlt
print(f"Version: {dlt.__version__}")
Version: 1.12.3
import dlt
import requests
from dlt.destinations import qdrant
@dlt.resource
def zoomcamp_data():
= 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_url = requests.get(docs_url)
docs_response = docs_response.json()
documents_raw
for course in documents_raw:
= course['course']
course_name
for doc in course['documents']:
'course'] = course_name
doc[yield doc
= qdrant(
qdrant_destination ="db.qdrant",
qd_path
)
= dlt.pipeline(
pipeline ="zoomcamp_pipeline",
pipeline_name=qdrant_destination,
destination="zoomcamp_tagged_data"
dataset_name
)= pipeline.run(zoomcamp_data())
load_info
print(f"Rows inserted: 948")
Rows inserted: 948
import json
= "/Users/tonywu/Documents/scripts/py/zoom_llm/db.qdrant/meta.json"
file_path
with open(file_path, 'r') as f:
= json.load(f)
data
# Collect all unique embedding model names
= set()
embedding_models
for collection in data.get("collections", {}).values():
= collection.get("vectors", {})
vectors for model_name in vectors.keys():
embedding_models.add(model_name)
print("Embedding model(s) used:")
for model in embedding_models:
print(model)
Embedding model(s) used:
fast-bge-small-en