|
|
|
@ -74,7 +74,7 @@ def crawl_page(url):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Listing of past conferences is paginated: page 1 is the bare URL,
# pages 2-7 append the page number to it.
baseurl = "https://www.dm.unipi.it/research/past-conferences/"

page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]
|
|
|
|
|
|
|
|
|
|
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]
|
|
|
|
|
|
|
|
|
@ -82,11 +82,18 @@ conference_html_snippets = [snippet for link in page_urls for snippet in crawl_p
|
|
|
|
|
json.dump(conference_html_snippets, open("paragraphs-html.json", "w"))
|
|
|
|
|
|
|
|
|
|
# Load the model and set the chat format; n_ctx=0 tells llama.cpp to use
# the default context length stored in the GGUF model file itself.
# verbose=False keeps llama.cpp's per-token logging out of the output.
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    chat_format="llama-2",
    verbose=False,
    n_ctx=0,
)
|
|
|
|
|
|
|
|
|
|
# The result file is a sequence of JSON objects, one per line (JSON Lines);
# it stays open across the whole translation loop below.
results_file = open(OUTPUT_FILE, "w")
# Snippets whose model output was not valid JSON are recorded here
# so they can be inspected or re-run later.
failed_json = open("failed.json", "w")
|
|
|
|
|
|
|
|
|
|
# Feed each scraped HTML snippet to the LLM for translation into a
# structured JSON record (the LLM call itself follows below).
for conference_html in conference_html_snippets:
    print("--------------------------------------------------")
    print("Translating:")
|
|
|
|
@ -100,11 +107,15 @@ for conference_html in conference_html_snippets:
|
|
|
|
|
try:
|
|
|
|
|
# parse the result string into a json object to check correctness
|
|
|
|
|
conference_object = json.loads(conference_json)
|
|
|
|
|
|
|
|
|
|
json.dump(conference_object, results_file)
|
|
|
|
|
results_file.write("\n")
|
|
|
|
|
results_file.flush()
|
|
|
|
|
except:
|
|
|
|
|
print("> json is invalid, skipping")
|
|
|
|
|
print(conference_json)
|
|
|
|
|
|
|
|
|
|
json.dump(conference_json, failed_json)
|
|
|
|
|
failed_json.write("\n")
|
|
|
|
|
failed_json.flush()
|
|
|
|
|
|
|
|
|
|
results_file.close()
|
|
|
|
|