initial commit, reworked @BachoSeven's code

main
Antonio De Lucreziis 10 months ago
commit 8c6364277e

11
.gitignore vendored

@ -0,0 +1,11 @@
# Local files
*.local*
# Python
venv/
# Editors
.vscode/
# LLM Models
*.gguf

@ -0,0 +1,40 @@
# Past Conferences Crawler (with LLM)
A Python script that crawls conferences from <https://www.dm.unipi.it/research/past-conferences/> and processes them using a local-run LLM (we used [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)) to translate the natural language info to a more structured format (json).
## Installation
Download the LLM model to use, specifically we use [`Mistral-7B-Instruct-v0.2-GGUF`](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF):
```bash
# download the model, ~4GB
$ wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf
```
Install the Python requirements:
```bash
# optionally, create and activate a venv
$ python -m venv venv
$ source venv/bin/activate
# build llama-cpp with OpenBLAS acceleration (a CPU BLAS backend, not GPU offloading)
$ export CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
# install requirements
$ pip install -r requirements.txt
```
## Launch
The following command will crawl the conferences from `https://www.dm.unipi.it/research/past-conferences/` (pages 1 to 6) and save the results in `conferences.json`:
```bash
$ python main.py
```
The output is a sequence of JSON objects, one per line (not a single JSON array). To display the results as one array with `jq`:
```bash
$ jq -s '.' conferences.json
```

@ -0,0 +1,126 @@
#!/usr/bin/env python3
from llama_cpp import Llama
import requests
from bs4 import BeautifulSoup
import textwrap
import json
# One-shot example shown to the LLM before the real snippet: a sample
# conference paragraph (INPUT) followed by the JSON we expect back (OUTPUT).
# The text ends with a dangling "INPUT:" header so that the actual conference
# HTML can be sent right after it.
# The OUTPUT part must itself be valid JSON, otherwise the model is taught to
# reproduce the syntax error (the comma after the "organizers" array used to
# be missing).
LLM_EXAMPLE = (
    "INPUT:\n"
    '<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical'
    " and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage"
    " (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja"
    " Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13"
    " &#8211; 16, 2022.</p>\n"
    "\n"
    "OUTPUT (JSON): \n"
    "{"
    '"title": "Statistical and Computational Aspects of Dynamics",'
    '"link": "http://www.crm.sns.it/event/507/", '
    '"description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica'
    " Ennio De Giorgi SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien,"
    ' Austria). Centro De Giorgi", '
    '"organizers": ['
    '"Buddhima Kasun Fernando Akurugodage", '
    '"Paolo Giulietti", '
    '"Tanja Isabelle Schindler"'
    "], "
    '"location": "SNS, Pisa", '
    '"date": "December 13 16, 2022.", '
    '"start_date": "2022-12-13", '
    '"end_date": "2022-12-16"'
    "}\n"
    "\n"
    "INPUT:\n"
)
def translate_to_json(conference_html: str) -> str:
    """Ask the LLM to rewrite one conference HTML snippet as JSON text.

    The chat sent to the module-level ``llm`` consists of a system
    instruction followed by three user messages: the worked example in
    ``LLM_EXAMPLE``, the snippet itself, and an "OUTPUT (JSON):" cue.

    Args:
        conference_html: HTML markup describing a single conference.

    Returns:
        The message content of the first completion choice, returned as-is
        (it is not parsed or validated here).
    """
    system_message = {
        "role": "system",
        "content": "You are an assistant aiding a software developer. Be precise in formatting the output correctly as requested",
    }
    user_messages = [
        {"role": "user", "content": text}
        for text in (LLM_EXAMPLE, conference_html, "OUTPUT (JSON):")
    ]

    completion = llm.create_chat_completion(
        messages=[system_message, *user_messages],
        max_tokens=None,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]
def crawl_page(url, *, timeout=30):
    """Fetch one listing page and return its conference entries as HTML.

    The entries are the sibling elements located between the first
    ``<h2 class="wp-block-heading">`` and the ``<div class="page-links">``
    element of the page; elements without visible text are skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A list of strings, each the serialized HTML of one element.

    Raises:
        Exception: If the response status is not 200, or if either of the
            two delimiting elements cannot be found.
    """
    print(f"Crawling {url}")
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Failed to fetch")

    soup = BeautifulSoup(r.text, "html.parser")

    # The conference list is delimited by these two elements
    h2 = soup.find("h2", class_="wp-block-heading")
    div = soup.find("div", class_="page-links")
    if h2 is None or div is None:
        raise Exception("Failed to find elements")

    # Walk the siblings after the heading until the pagination div is reached.
    # Identity (`is not`) is used on purpose: BeautifulSoup's `!=` compares
    # markup structurally, so a look-alike element could end the walk early.
    result = []
    current = h2.find_next_sibling()
    while current and current is not div:
        if current.name is not None and current.text.strip():
            result.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(result)} conferences")
    return result
baseurl = "https://www.dm.unipi.it/research/past-conferences/"
# Page 1 is the base URL itself; the following pages are <baseurl>2 ... <baseurl>6
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)]
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]

print("LLM Example Context:")
print(LLM_EXAMPLE)
# NOTE: a leftover debugging `exit(1)` used to sit here, which stopped the
# script before the model was loaded and before anything was translated.

# Load the model, set the chat format and use the default model context length
llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0)

# The result file is a sequence of json objects, one per line.
# Opening with "w" clears any previous content; the context manager closes the
# file even if the loop is interrupted.
with open("conferences.json", "w") as results_file:
    for conference_html in conference_html_snippets:
        print("Translating:")
        print(conference_html)

        conference_json = translate_to_json(conference_html)

        print("Result:")
        print(conference_json)

        try:
            # parse the result string into a json object to check correctness
            conference_object = json.loads(conference_json)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string answer. Anything else (e.g. Ctrl-C) must propagate.
            print("> json is invalid, skipping")
            print(conference_json)
            continue

        json.dump(conference_object, results_file)
        results_file.write("\n")
        # flush after every record so partial results survive an aborted run
        results_file.flush()

@ -0,0 +1,14 @@
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2023.11.17
charset-normalizer==3.3.2
diskcache==5.6.3
idna==3.6
Jinja2==3.1.3
llama_cpp_python==0.2.36
MarkupSafe==2.1.4
numpy==1.26.3
requests==2.31.0
soupsieve==2.5
typing_extensions==4.9.0
urllib3==2.1.0
Loading…
Cancel
Save