fix: boh

9 months ago · 84aee131e0
parent f160c8200a
commit 84aee131e0
1 changed file with 14 additions and 3 deletions
--- a/main.py
+++ b/main.py
@@ -74,7 +74,7 @@ def crawl_page(url):


 baseurl = "https://www.dm.unipi.it/research/past-conferences/"
-page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)]
+page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]

 conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]

@@ -82,11 +82,18 @@ conference_html_snippets = [snippet for link in page_urls for snippet in crawl_p
 json.dump(conference_html_snippets, open("paragraphs-html.json", "w"))

 # Load the model and, set the chat format and use the default model context length
-llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0)
+llm = Llama(
+    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", 
+    chat_format="llama-2", 
+    verbose=False,
+    n_ctx=0,
+)

 # the result file is a sequence of json objects, one per line
 results_file = open(OUTPUT_FILE, "w")

+failed_json = open("failed.json", "w")
+
 for conference_html in conference_html_snippets:
    print("--------------------------------------------------")
    print("Translating:")
@@ -100,11 +107,15 @@ for conference_html in conference_html_snippets:
    try:
        # parse the result string into a json object to check correctness
        conference_object = json.loads(conference_json)
+        
        json.dump(conference_object, results_file)
        results_file.write("\n")
        results_file.flush()
    except:
        print("> json is invalid, skipping")
-        print(conference_json)
+
+        json.dump(conference_json, failed_json)
+        failed_json.write("\n")
+        failed_json.flush()

 results_file.close()