diff --git a/main.py b/main.py index 854c039..2634469 100755 --- a/main.py +++ b/main.py @@ -6,7 +6,7 @@ import requests import json -OUTPUT_FILE = "conferences.json" +OUTPUT_FILE = "results.json" HTML_EXAMPLE = r"""

Statistical and Computational Aspects of Dynamics
Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi – SNS, Pisa. December 13 – 16, 2022.

""" @@ -31,11 +31,11 @@ def translate_to_json(conference_html: str) -> str: }, {"role": "user", "content": "INPUT:"}, {"role": "user", "content": HTML_EXAMPLE }, - {"role": "user", "content": "OUTPUT:"}, + {"role": "user", "content": "OUTPUT JSON:"}, {"role": "user", "content": OUTPUT_EXAMPLE}, {"role": "user", "content": "INPUT:"}, {"role": "user", "content": conference_html }, - {"role": "user", "content": "OUTPUT:"}, + {"role": "user", "content": "OUTPUT JSON:"}, ], ) @@ -78,9 +78,6 @@ page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)] conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)] -# Log the conference snippets to a file -json.dump(conference_html_snippets, open("paragraphs-html.json", "w")) - # Load the model and, set the chat format and use the default model context length llm = Llama( model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", @@ -92,8 +89,6 @@ llm = Llama( # the result file is a sequence of json objects, one per line results_file = open(OUTPUT_FILE, "w") -failed_json = open("failed.json", "w") - for conference_html in conference_html_snippets: print("--------------------------------------------------") print("Translating:") @@ -101,21 +96,31 @@ for conference_html in conference_html_snippets: conference_json = translate_to_json(conference_html) + result = { + "input_html": conference_html, + "raw_output": conference_json, + # "json": None, + # "success": False, + } + print("Result:") print(conference_json) try: # parse the result string into a json object to check correctness conference_object = json.loads(conference_json) - - json.dump(conference_object, results_file) - results_file.write("\n") - results_file.flush() + + result["success"] = True + result["json"] = conference_object except: print("> json is invalid, skipping") - json.dump(conference_json, failed_json) - failed_json.write("\n") - failed_json.flush() + result["success"] = False + + json.dump(result, results_file) + 
results_file.write("\n") + results_file.flush() + results_file.close()