import requests
# Download the course FAQ and flatten it into a single list of documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Tag every document with its course so it can be filtered on later.
        doc['course'] = course_name
        documents.append(doc)
LLM Zoomcamp 2025: Module 1 Homework
1 Problem 1
Run Elasticsearch 8.17.6
# Start a throwaway single-node Elasticsearch 8.17.6 container with security
# disabled, capped at 4GB of memory, publishing ports 9200 and 9300.
docker run \
    --interactive --tty \
    --rm \
    --name elasticsearch \
    --memory 4GB \
    --publish 9200:9200 \
    --publish 9300:9300 \
    --env "discovery.type=single-node" \
    --env "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.17.6

# Query the node's root endpoint to confirm that it is up.
curl localhost:9200
The `version.build_hash` value is "dbcbbbd0bc4924cfeb28929dc05d82d662c527b7".
2 Getting the data
3 Problem 2
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
# Connect to the local Elasticsearch node.
es_client = Elasticsearch('http://localhost:9200')

# Single shard, no replicas. The free-text fields are analysed as "text";
# "course" is a "keyword" field.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Check if the index already exists; drop it so every run starts clean.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

# Create the index
es_client.indices.create(index=index_name, body=index_settings)

# Build the index
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)
The function used for adding data to Elasticsearch is `index`.
4 Problem 3
def elastic_search_3(query):
    """Search the FAQ index across all courses and return the top five hits.

    The query is matched against the ``question`` and ``text`` fields with
    a ``best_fields`` multi-match; ``question`` is boosted by a factor of 4.

    Args:
        query: Free-text user question.

    Returns:
        A tuple ``(result_docs, top_score)`` where ``result_docs`` is the
        list of ``_source`` documents of the hits and ``top_score`` is the
        ``_score`` of the first hit, or ``0`` when nothing matched.
    """
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    hits = response['hits']['hits']

    result_docs = [hit['_source'] for hit in hits]
    # Hits come back ordered by score, so the first one carries the top score.
    top_score = hits[0]['_score'] if hits else 0

    return result_docs, top_score
# Problem 3: report the score of the best match for the Kubernetes question.
docs_3, top_score_3 = elastic_search_3("How do execute a command on a Kubernetes pod?")
print(f"Top score: {top_score_3}")
Top score: 42.848103
5 Problem 4
def elastic_search_4(query):
    """Search the FAQ index, restricted to ``machine-learning-zoomcamp``.

    The query is matched against the ``question`` and ``text`` fields with
    a ``best_fields`` multi-match; ``question`` is boosted by a factor of 4.
    A ``term`` filter on ``course`` limits the results to the
    ``machine-learning-zoomcamp`` course, and at most three hits are
    requested.

    Args:
        query: Free-text user question.

    Returns:
        A tuple ``(result_docs, top_score)`` where ``result_docs`` is the
        list of ``_source`` documents of the hits and ``top_score`` is the
        ``_score`` of the first hit, or ``0`` when nothing matched.
    """
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    hits = response['hits']['hits']

    result_docs = [hit['_source'] for hit in hits]
    # Hits come back ordered by score, so the first one carries the top score.
    top_score = hits[0]['_score'] if hits else 0

    return result_docs, top_score
# Problem 4: show the question of the third hit for the Docker query.
docs_4, top_score_4 = elastic_search_4("How do copy a file to a Docker container?")
print(f"The 3rd question returned is: {docs_4[2]['question']}")
The 3rd question returned is: How do I copy files from a different folder into docker container’s working directory?
6 Problem 5
def build_prompt(query, search_results):
    """Build the LLM prompt for ``query`` from the retrieved FAQ entries.

    Each entry in ``search_results`` is rendered as a ``Q: ... / A: ...``
    pair; the pairs are joined with a blank line and inserted as the
    CONTEXT of the prompt.

    Args:
        query: The user's question.
        search_results: Iterable of dicts with ``"question"`` and
            ``"text"`` keys.

    Returns:
        The formatted prompt, with leading and trailing whitespace stripped.
    """
    # NOTE(review): the template lines are reproduced exactly as they appear
    # in this file. If the notebook this was exported from had blank lines
    # between the sections, they are not visible here; confirm against it.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "QUESTION: {question}\n"
        "CONTEXT:\n"
        "{context}"
    )

    context_template = (
        "Q: {question}\n"
        "A: {text}"
    )

    # Build context using the new context_template
    context = "\n\n".join(
        context_template.format(question=doc["question"], text=doc["text"])
        for doc in search_results
    )

    # Format the final prompt
    return prompt_template.format(question=query, context=context).strip()
# Problem 5: build the prompt from the Problem 4 results and report its length.
prompt = build_prompt("How do I execute a command in a running docker container?", docs_4)
print(f"The length of the resulting prompt is: {len(prompt)}")
The length of the resulting prompt is: 1462
7 Problem 6
import tiktoken
# Count how many gpt-4o tokens the prompt encodes to.
encoding = tiktoken.encoding_for_model("gpt-4o")
num_tokens = len(encoding.encode(prompt))
print(f"The length of tokens is: {num_tokens}")
The length of tokens is: 322