From 84aee131e0293ecc396ce496d12d967ea7b6b161 Mon Sep 17 00:00:00 2001 From: Antonio De Lucreziis Date: Thu, 1 Feb 2024 15:56:22 +0100 Subject: [PATCH] fix: boh --- main.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 2eaf721..854c039 100755 --- a/main.py +++ b/main.py @@ -74,7 +74,7 @@ def crawl_page(url): baseurl = "https://www.dm.unipi.it/research/past-conferences/" -page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)] +page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)] conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)] @@ -82,11 +82,18 @@ conference_html_snippets = [snippet for link in page_urls for snippet in crawl_p json.dump(conference_html_snippets, open("paragraphs-html.json", "w")) # Load the model and, set the chat format and use the default model context length -llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0) +llm = Llama( + model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", + chat_format="llama-2", + verbose=False, + n_ctx=0, +) # the result file is a sequence of json objects, one per line results_file = open(OUTPUT_FILE, "w") +failed_json = open("failed.json", "w") + for conference_html in conference_html_snippets: print("--------------------------------------------------") print("Translating:") @@ -100,11 +107,15 @@ for conference_html in conference_html_snippets: try: # parse the result string into a json object to check correctness conference_object = json.loads(conference_json) + json.dump(conference_object, results_file) results_file.write("\n") results_file.flush() except: print("> json is invalid, skipping") - print(conference_json) + + json.dump(conference_json, failed_json) + failed_json.write("\n") + failed_json.flush() results_file.close()