@ -6,7 +6,7 @@ import requests
import json
# Path of the file that the per-snippet results are written to (it is opened for writing further down).
# NOTE(review): OUTPUT_FILE is assigned twice back-to-back here; if both lines execute, only the
# second value is ever used. Presumably these are the old and new sides of a diff whose +/- markers
# were lost -- confirm which value is intended.
OUTPUT_FILE = " conference s.json"
OUTPUT_FILE = " result s.json"
# Sample conference paragraph (raw HTML) that is sent as the example INPUT message of the prompt.
HTML_EXAMPLE = r """ <p><a href= " http://www.crm.sns.it/event/507/ " target= " _blank " rel= " noreferrer noopener " >Statistical and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi – SNS, Pisa. December 13 – 16, 2022.</p> """
@ -31,11 +31,11 @@ def translate_to_json(conference_html: str) -> str:
} ,
{ " role " : " user " , " content " : " INPUT: " } ,
{ " role " : " user " , " content " : HTML_EXAMPLE } ,
{ " role " : " user " , " content " : " OUTPUT :" } ,
{ " role " : " user " , " content " : " OUTPUT JSON :" } ,
{ " role " : " user " , " content " : OUTPUT_EXAMPLE } ,
{ " role " : " user " , " content " : " INPUT: " } ,
{ " role " : " user " , " content " : conference_html } ,
{ " role " : " user " , " content " : " OUTPUT :" } ,
{ " role " : " user " , " content " : " OUTPUT JSON :" } ,
] ,
)
@ -78,9 +78,6 @@ page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]
# Call crawl_page on every URL in page_urls and flatten everything it yields into one list,
# preserving page order.
conference_html_snippets = [ snippet for link in page_urls for snippet in crawl_page ( link ) ]
# Log the conference snippets to a file
# NOTE(review): the handle returned by open() is passed inline and never explicitly closed,
# so flushing relies on the interpreter finalizing the file object; a `with` block would make
# this deterministic.
json . dump ( conference_html_snippets , open ( " paragraphs-html.json " , " w " ) )
# Load the model, set the chat format, and use the default model context length
llm = Llama (
model_path = " ./mistral-7b-instruct-v0.2.Q4_K_M.gguf " ,
@ -92,8 +89,6 @@ llm = Llama(
# The result file is a sequence of JSON objects, one per line.
# Both files are opened for writing and are closed explicitly at the very end of the script
# (not via `with`), so an uncaught exception in between would leave them unclosed.
results_file = open ( OUTPUT_FILE , " w " )
failed_json = open ( " failed.json " , " w " )
# Main loop: run translate_to_json on every crawled HTML snippet, print the outcome, check that
# the returned string parses as JSON, and write one line per snippet to the output file(s).
for conference_html in conference_html_snippets :
print ( " -------------------------------------------------- " )
print ( " Translating: " )
@ -101,6 +96,13 @@ for conference_html in conference_html_snippets:
conference_json = translate_to_json ( conference_html )
# Per-snippet record: the source HTML plus the raw string returned by translate_to_json.
# "success" (and, when parsing works, "json") are added to it further down.
result = {
" input_html " : conference_html ,
" raw_output " : conference_json ,
# "json": None,
# "success": False,
}
print ( " Result: " )
print ( conference_json )
@ -108,14 +110,17 @@ for conference_html in conference_html_snippets:
# parse the result string into a json object to check correctness
# NOTE(review): the `try:` that pairs with the `except` below is not visible in this excerpt.
conference_object = json . loads ( conference_json )
# NOTE(review): the parsed object is written to results_file here, and the full `result` record
# is written to the same file again after the except branch. Presumably one of the two is the
# pre-change side of a diff -- confirm that only one of them is live.
json . dump ( conference_object , results_file )
results_file . write ( " \n " )
results_file . flush ( )
result [ " success " ] = True
result [ " json " ] = conference_object
# NOTE(review): a bare except also traps KeyboardInterrupt and SystemExit. json.loads reports
# malformed input with json.JSONDecodeError (a ValueError subclass), which would be the narrower
# exception to catch here.
except :
print ( " > json is invalid, skipping " )
# The unparsable raw output is appended to failed_json, one JSON-encoded string per line.
json . dump ( conference_json , failed_json )
failed_json . write ( " \n " )
failed_json . flush ( )
result [ " success " ] = False
# Append the record for this snippet as one line and flush immediately, so results written so far
# are on disk even if a later iteration fails.
json . dump ( result , results_file )
results_file . write ( " \n " )
results_file . flush ( )
# Close both output files once every snippet has been processed.
results_file . close ( )
failed_json . close ( )