feat: better result file structure

main
Antonio De Lucreziis 10 months ago
parent 84aee131e0
commit cbe59be2d9

@ -6,7 +6,7 @@ import requests
import json
OUTPUT_FILE = "conferences.json"
OUTPUT_FILE = "results.json"
HTML_EXAMPLE = r"""<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13 &#8211; 16, 2022.</p>"""
@ -31,11 +31,11 @@ def translate_to_json(conference_html: str) -> str:
},
{"role": "user", "content": "INPUT:"},
{"role": "user", "content": HTML_EXAMPLE },
{"role": "user", "content": "OUTPUT:"},
{"role": "user", "content": "OUTPUT JSON:"},
{"role": "user", "content": OUTPUT_EXAMPLE},
{"role": "user", "content": "INPUT:"},
{"role": "user", "content": conference_html },
{"role": "user", "content": "OUTPUT:"},
{"role": "user", "content": "OUTPUT JSON:"},
],
)
@ -78,9 +78,6 @@ page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]
# Log the conference snippets to a file
json.dump(conference_html_snippets, open("paragraphs-html.json", "w"))
# Load the model, set the chat format, and use the default model context length
llm = Llama(
model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
@ -92,8 +89,6 @@ llm = Llama(
# the result file is a sequence of json objects, one per line
results_file = open(OUTPUT_FILE, "w")
failed_json = open("failed.json", "w")
for conference_html in conference_html_snippets:
print("--------------------------------------------------")
print("Translating:")
@ -101,21 +96,31 @@ for conference_html in conference_html_snippets:
conference_json = translate_to_json(conference_html)
result = {
"input_html": conference_html,
"raw_output": conference_json,
# "json": None,
# "success": False,
}
print("Result:")
print(conference_json)
try:
# parse the result string into a json object to check correctness
conference_object = json.loads(conference_json)
json.dump(conference_object, results_file)
results_file.write("\n")
results_file.flush()
result["success"] = True
result["json"] = conference_object
except:
print("> json is invalid, skipping")
json.dump(conference_json, failed_json)
failed_json.write("\n")
failed_json.flush()
result["success"] = False
json.dump(result, results_file)
results_file.write("\n")
results_file.flush()
results_file.close()
failed_json.close()

Loading…
Cancel
Save