From 8c6364277e14334db4187f48e348a69359a86c10 Mon Sep 17 00:00:00 2001 From: Antonio De Lucreziis Date: Tue, 30 Jan 2024 00:01:50 +0100 Subject: [PATCH] initial commit, reworked @BachoSeven's code --- .gitignore | 11 +++++ README.md | 40 +++++++++++++++ main.py | 126 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 14 ++++++ 4 files changed, 191 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100755 main.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..83acb35 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +# Local files +*.local* + +# Python +venv/ + +# Editors +.vscode/ + +# LLM Models +*.gguf \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..676fd0b --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +# Past Conferences Crawler (with LLM) + +A Python script that crawls conferences from `https://www.dm.unipi.it/research/past-conferences/` and processes them using a local-run LLM (we used [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)) to translate the natural language info to a more structured format (json). 
#!/usr/bin/env python3
"""Crawl past-conference listings from the UniPi maths department website and
use a locally-run LLM (Mistral-7B-Instruct via llama-cpp) to convert each
conference's HTML snippet into a structured JSON object, written one object
per line (JSON Lines) to ``conferences.json``.
"""

from llama_cpp import Llama
import requests
from bs4 import BeautifulSoup
import json

# One-shot example shown to the model: an input HTML snippet followed by the
# JSON we expect back.  This is prompt text, not parsed data.
# NOTE(review): the HTML tags in the example were partially garbled in the
# committed file; reconstructed best-effort — confirm against a real snippet.
# FIX: the example JSON previously lacked a comma after the "organizers"
# array, so the one-shot example itself was invalid JSON.
LLM_EXAMPLE = (
    "INPUT:\n"
    "<div>\n"
    '<strong><a href="http://www.crm.sns.it/event/507/">Statistical'
    " and Computational Aspects of Dynamics</a></strong><br/>\n"
    "Organized by Buddhima Kasun Fernando Akurugodage"
    " (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja"
    " Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi – SNS, Pisa. December 13"
    " – 16, 2022.\n"
    "</div>\n"
    "\n"
    "OUTPUT (JSON): \n"
    "{"
    '"title": "Statistical and Computational Aspects of Dynamics",'
    '"link": "http://www.crm.sns.it/event/507/", '
    '"description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica'
    " Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien,"
    ' Austria). Centro De Giorgi", '
    '"organizers": ['
    '"Buddhima Kasun Fernando Akurugodage", '
    '"Paolo Giulietti", '
    '"Tanja Isabelle Schindler"'
    "], "
    '"location": "SNS, Pisa", '
    '"date": "December 13 – 16, 2022.", '
    '"start_date": "2022-12-13", '
    '"end_date": "2022-12-16"'
    "}\n"
    "\n"
    "INPUT:\n"
)


def translate_to_json(conference_html: str) -> str:
    """Ask the LLM to translate one conference HTML snippet into JSON.

    Relies on the module-level ``llm`` model, which is loaded in the script
    body below before this function is first called.  Returns the raw model
    output, which should be a single JSON object (validated by the caller).
    """
    llm_answer = llm.create_chat_completion(
        max_tokens=None,  # let the model run to its natural stop token
        messages=[
            {
                "role": "system",
                "content": "You are an assistant aiding a software developer. Be precise in formatting the output correctly as requested",
            },
            # One-shot example, then the real input, then the output cue.
            {"role": "user", "content": LLM_EXAMPLE},
            {"role": "user", "content": conference_html},
            {"role": "user", "content": "OUTPUT (JSON):"},
        ],
    )

    return llm_answer["choices"][0]["message"]["content"]


def crawl_page(url: str) -> list:
    """Fetch one listing page and return its conference HTML snippets.

    The conferences are the non-empty elements sitting between the page's
    ``h2.wp-block-heading`` heading and the ``div.page-links`` pagination
    block.  Each snippet is returned as a raw HTML string.

    Raises:
        Exception: if the HTTP request does not return 200, or the expected
            heading/pagination landmarks are missing from the page.
    """
    print(f"Crawling {url}")
    r = requests.get(url)

    if r.status_code != 200:
        raise Exception("Failed to fetch")

    soup = BeautifulSoup(r.text, "html.parser")

    # Landmarks delimiting the conference list on the page.
    h2 = soup.find("h2", class_="wp-block-heading")
    div = soup.find("div", class_="page-links")

    if not (h2 and div):
        raise Exception("Failed to find elements")

    # Collect every non-empty tag strictly between the two landmarks.
    result = []
    current = h2.find_next_sibling()
    while current and current != div:
        if current.name is not None and current.text.strip():
            result.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(result)} conferences")
    return result


baseurl = "https://www.dm.unipi.it/research/past-conferences/"
# Page 1 is the base URL itself; later pages live at .../2, .../3, ...
# NOTE(review): README says pages 1 to 5, but range(2, 7) crawls pages 2-6
# (six pages total) — confirm the intended page count.
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)]

conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]

print("LLM Example Context:")
print(LLM_EXAMPLE)

# FIX: removed a leftover debug `exit(1)` that aborted the script here,
# before the model was ever loaded — the script could never produce output.

# Load the model, set the chat format, and use the default model context
# length (n_ctx=0 means "take it from the model file").
llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0)

# The result file is a sequence of JSON objects, one per line (JSON Lines).
# FIX: a single `open(..., "w")` in a context manager replaces the previous
# truncate-then-reopen-for-append pair and guarantees the file is closed.
with open("conferences.json", "w") as results_file:
    for conference_html in conference_html_snippets:
        print("Translating:")
        print(conference_html)

        conference_json = translate_to_json(conference_html)

        print("Result:")
        print(conference_json)

        try:
            # Parse the result string into a json object to check correctness.
            # FIX: catch only JSONDecodeError instead of a bare `except:` that
            # would also swallow KeyboardInterrupt and real bugs.
            conference_object = json.loads(conference_json)
        except json.JSONDecodeError:
            print("> json is invalid, skipping")
            print(conference_json)
            continue

        json.dump(conference_object, results_file)
        results_file.write("\n")
        # Flush after each object so progress survives a crash mid-run.
        results_file.flush()