import requests
# Raw JSON export of the course FAQ documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()
documents = []
for course in documents_raw:
course_name = course['course']
for doc in course['documents']:
doc['course'] = course_name
documents.append(doc)
LLM Zoomcamp 2025: Module 1 Homework
1 Problem 1
Run Elasticsearch 8.17.6
docker run -it \
--rm \
--name elasticsearch \
-m 4GB \
-p 9200:9200 \
-p 9300:9300 \
-e "discovery.type=single-node" \
-e "xpack.security.enabled=false" \
docker.elastic.co/elasticsearch/elasticsearch:8.17.6
curl localhost:9200
The version.build_hash value is "dbcbbbd0bc4924cfeb28929dc05d82d662c527b7"
2 Getting the data
3 Problem 2
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
# Client for the local Elasticsearch node listening on port 9200.
es_client = Elasticsearch('http://localhost:9200')

# Index definition: one shard, no replicas, and one mapping entry per
# document field.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword rather than text: the course is matched exactly by
            # the term filter used when searching a single course.
            "course": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

# Check if the index already exists; drop it so a re-run starts from an
# empty index instead of adding the same documents a second time.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

# Create the index
es_client.indices.create(index=index_name, body=index_settings)
# Build the index
for doc in tqdm(documents):
es_client.index(index=index_name, document=doc)
The function used for adding data to Elasticsearch is `index`.
4 Problem 3
def elastic_search_3(query, size=5, client=None, index=None):
    """Search the FAQ index for ``query`` and report the best score.

    The query is run as a ``best_fields`` multi-match over the ``question``
    and ``text`` fields, with ``question`` boosted by a factor of 4.

    Args:
        query: Free-text question to search for.
        size: Maximum number of hits to request (default 5).
        client: Object exposing an Elasticsearch-style
            ``search(index=..., body=...)`` method. Defaults to the
            module-level ``es_client``.
        index: Name of the index to search. Defaults to the module-level
            ``index_name``.

    Returns:
        A ``(result_docs, top_score)`` tuple: the ``_source`` of every
        returned hit, in the order Elasticsearch returned them, and the
        ``_score`` of the first hit (``0`` when there are no hits).
    """
    # Resolve the defaults at call time so the globals are only needed
    # when the caller does not supply its own client/index.
    if client is None:
        client = es_client
    if index is None:
        index = index_name

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields",
                    }
                },
            }
        },
    }

    response = client.search(index=index, body=search_query)
    hits = response['hits']['hits']
    result_docs = [hit['_source'] for hit in hits]

    # Decide on the returned hits themselves rather than on the reported
    # total, so an empty hit list can never be indexed.
    top_score = hits[0]['_score'] if hits else 0
    return result_docs, top_score
docs_3, top_score_3 = elastic_search_3("How do execute a command on a Kubernetes pod?")
print(f"Top score: {top_score_3}")
Top score: 42.848103
5 Problem 4
def elastic_search_4(query, size=3, course="machine-learning-zoomcamp",
                     client=None, index=None):
    """Search the FAQ index for ``query`` within a single course.

    The query is run as a ``best_fields`` multi-match over the ``question``
    and ``text`` fields, with ``question`` boosted by a factor of 4, and the
    results are restricted by a ``term`` filter on the ``course`` field.

    Args:
        query: Free-text question to search for.
        size: Maximum number of hits to request (default 3).
        course: Exact value the ``course`` field must have
            (default ``"machine-learning-zoomcamp"``).
        client: Object exposing an Elasticsearch-style
            ``search(index=..., body=...)`` method. Defaults to the
            module-level ``es_client``.
        index: Name of the index to search. Defaults to the module-level
            ``index_name``.

    Returns:
        A ``(result_docs, top_score)`` tuple: the ``_source`` of every
        returned hit, in the order Elasticsearch returned them, and the
        ``_score`` of the first hit (``0`` when there are no hits).
    """
    # Resolve the defaults at call time so the globals are only needed
    # when the caller does not supply its own client/index.
    if client is None:
        client = es_client
    if index is None:
        index = index_name

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields",
                    }
                },
                "filter": {
                    "term": {
                        "course": course,
                    }
                },
            }
        },
    }

    response = client.search(index=index, body=search_query)
    hits = response['hits']['hits']
    result_docs = [hit['_source'] for hit in hits]

    # Decide on the returned hits themselves rather than on the reported
    # total, so an empty hit list can never be indexed.
    top_score = hits[0]['_score'] if hits else 0
    return result_docs, top_score
docs_4, top_score_4 = elastic_search_4("How do copy a file to a Docker container?")
print(f"The 3rd question returned is: {docs_4[2]['question']}")
The 3rd question returned is: How do I copy files from a different folder into docker container’s working directory?
6 Problem 5
def build_prompt(query, search_results):
    """Build the LLM prompt for ``query`` from the retrieved FAQ entries.

    Each entry in ``search_results`` is rendered as a ``Q: ... / A: ...``
    pair; the pairs are separated by a blank line and placed under the
    ``CONTEXT:`` heading of the prompt.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts that each have a ``"question"``
            and a ``"text"`` key; any other keys are ignored.

    Returns:
        The formatted prompt with leading and trailing whitespace removed.

    Raises:
        KeyError: If an entry lacks the ``"question"`` or ``"text"`` key.
    """
    # Spelled out line by line so the function body can be indented
    # without leaking indentation into the prompt text.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "QUESTION: {question}\n"
        "CONTEXT:\n"
        "{context}"
    )
    context_template = "Q: {question}\nA: {text}"

    # Build context using the context_template, one entry per document.
    context = "\n\n".join(
        context_template.format(question=doc["question"], text=doc["text"])
        for doc in search_results
    )

    # Format the final prompt; strip() drops the dangling newline left
    # after "CONTEXT:" when there are no search results.
    return prompt_template.format(question=query, context=context).strip()
prompt = build_prompt("How do I execute a command in a running docker container?", docs_4)
print(f"The length of the resulting prompt is: {len(prompt)}")
The length of the resulting prompt is: 1462
7 Problem 6
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")
num_tokens = len(encoding.encode(prompt))
print(f"The length of tokens is: {num_tokens}")
The length of tokens is: 322