main
Antonio De Lucreziis 10 months ago
parent f160c8200a
commit 84aee131e0

@ -74,7 +74,7 @@ def crawl_page(url):
baseurl = "https://www.dm.unipi.it/research/past-conferences/"
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)]
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]
@ -82,11 +82,18 @@ conference_html_snippets = [snippet for link in page_urls for snippet in crawl_p
json.dump(conference_html_snippets, open("paragraphs-html.json", "w"))
# Load the model, set the chat format, and use the model's default context length
llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0)
llm = Llama(
model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
chat_format="llama-2",
verbose=False,
n_ctx=0,
)
# The result file is a sequence of JSON objects, one per line
results_file = open(OUTPUT_FILE, "w")
failed_json = open("failed.json", "w")
for conference_html in conference_html_snippets:
print("--------------------------------------------------")
print("Translating:")
@ -100,11 +107,15 @@ for conference_html in conference_html_snippets:
try:
# parse the result string into a json object to check correctness
conference_object = json.loads(conference_json)
json.dump(conference_object, results_file)
results_file.write("\n")
results_file.flush()
except:
print("> json is invalid, skipping")
print(conference_json)
json.dump(conference_json, failed_json)
failed_json.write("\n")
failed_json.flush()
results_file.close()

Loading…
Cancel
Save