main
Antonio De Lucreziis 10 months ago
parent f160c8200a
commit 84aee131e0

@ -74,7 +74,7 @@ def crawl_page(url):
baseurl = "https://www.dm.unipi.it/research/past-conferences/" baseurl = "https://www.dm.unipi.it/research/past-conferences/"
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)] page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)] conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]
@ -82,11 +82,18 @@ conference_html_snippets = [snippet for link in page_urls for snippet in crawl_p
json.dump(conference_html_snippets, open("paragraphs-html.json", "w")) json.dump(conference_html_snippets, open("paragraphs-html.json", "w"))
# Load the model, set the chat format, and use the default model context length # Load the model, set the chat format, and use the default model context length
llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0) llm = Llama(
model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
chat_format="llama-2",
verbose=False,
n_ctx=0,
)
# the result file is a sequence of json objects, one per line # the result file is a sequence of json objects, one per line
results_file = open(OUTPUT_FILE, "w") results_file = open(OUTPUT_FILE, "w")
failed_json = open("failed.json", "w")
for conference_html in conference_html_snippets: for conference_html in conference_html_snippets:
print("--------------------------------------------------") print("--------------------------------------------------")
print("Translating:") print("Translating:")
@ -100,11 +107,15 @@ for conference_html in conference_html_snippets:
try: try:
# parse the result string into a json object to check correctness # parse the result string into a json object to check correctness
conference_object = json.loads(conference_json) conference_object = json.loads(conference_json)
json.dump(conference_object, results_file) json.dump(conference_object, results_file)
results_file.write("\n") results_file.write("\n")
results_file.flush() results_file.flush()
except: except:
print("> json is invalid, skipping") print("> json is invalid, skipping")
print(conference_json)
json.dump(conference_json, failed_json)
failed_json.write("\n")
failed_json.flush()
results_file.close() results_file.close()

Loading…
Cancel
Save