// You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//
// 88 lines
// 2.4 KiB
// C++

#pragma once
#include <iostream>
#include <vector>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <stdint.h>
#include <fstream>
using namespace std;
/// A phrase: the ordered list of words that make it up.
using phrase = std::vector<std::string>;
/// Tells whether `a` can be part of a word.
///
/// @param a character to classify.
/// @return true for ASCII letters ('a'-'z', 'A'-'Z'), ASCII digits ('0'-'9')
///         and '@'; false for everything else.
///
/// Declared `inline` because this file is guarded by `#pragma once`, i.e. it is
/// meant to be included as a header; a non-inline definition would be emitted in
/// every translation unit that includes it and fail at link time.
inline bool isWordChar(char a) {
    const bool isLower = 'a' <= a && a <= 'z';
    const bool isUpper = 'A' <= a && a <= 'Z';
    const bool isDigit = '0' <= a && a <= '9';
    return isLower || isUpper || isDigit || a == '@';
}
/// Tells whether `a` terminates the current phrase.
///
/// @param a character to classify.
/// @return true for any of: ! , . : ; ? ( ) / \ newline [ ] _ { } ~
///         and false for every other character (including space and tab).
///
/// Declared `inline` because this file is guarded by `#pragma once`, i.e. it is
/// meant to be included as a header; a non-inline definition would be emitted in
/// every translation unit that includes it and fail at link time.
inline bool isPhraseBreak(char a) {
    switch (a) {
    case '!':
    case ',':
    case '.':
    case ':':
    case ';':
    case '?':
    case '(':
    case ')':
    case '/':
    case '\\':
    case '\n':
    case '[':
    case ']':
    case '_':
    case '{':
    case '}':
    case '~':
        return true;
    default:
        return false;
    }
}
/// Loads the stop-word list from the file "stop_words.txt".
///
/// The file is opened with a relative path and read line by line; each line
/// becomes one entry of the returned set. If the file cannot be opened the
/// loop never runs and an empty set is returned.
///
/// @return the set of lines read from "stop_words.txt".
///
/// Declared `inline` because this file is guarded by `#pragma once`, i.e. it is
/// meant to be included as a header; a non-inline definition would be emitted in
/// every translation unit that includes it and fail at link time.
inline std::unordered_set<std::string> getStopWords() {
    std::unordered_set<std::string> swords;
    std::ifstream stop_words("stop_words.txt");
    std::string line;
    while (std::getline(stop_words, line)) {
        // A file saved with CRLF line endings leaves a trailing '\r' on every
        // line; such an entry could never equal a parsed word, so drop it.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        swords.insert(line);
    }
    return swords;
}
/// Converts an ASCII upper-case letter to lower case.
///
/// @param in character to convert.
/// @return the lower-case counterpart when `in` is in 'A'-'Z'; otherwise `in`
///         unchanged.
///
/// Declared `inline` because this file is guarded by `#pragma once`, i.e. it is
/// meant to be included as a header; a non-inline definition would be emitted in
/// every translation unit that includes it and fail at link time.
inline char asciitolower(char in) {
    const bool isUpper = in >= 'A' && in <= 'Z';
    // 'a' - 'A' is the fixed distance between the two letter cases.
    return isUpper ? static_cast<char>(in + ('a' - 'A')) : in;
}
vector<phrase> parseFile (string text) {
for (size_t i = 0; i < text.size(); i++)
{
text[i] = asciitolower(text[i]);
}
unordered_set<string> stop_words = getStopWords();
// prendo un vettore, raggruppo caratteri tra spazi in stringhe fino a punteggiatura, raggruppo queste stringhe in periodi che inserisco in un nuovo vettore
vector<phrase> phrases;
phrase phrase;
string word;
for (int i = 0; i < text.size(); i++) {
if (isWordChar(text[i])) {
word.append(1, text[i]);
}
else if (isPhraseBreak(text[i])) {
if (word.size() > 0 && stop_words.find(word) == stop_words.end()) {
phrase.push_back(word);
}
word = "";
if (phrase.size() > 0) {
phrases.push_back(phrase);
phrase.clear();
}
}
else {
if (word.size() > 0 && stop_words.find(word) == stop_words.end()) {
phrase.push_back(word);
}
word = "";
}
}
if (word.size() > 0 && stop_words.find(word) == stop_words.end()) {
phrase.push_back(word);
}
if (phrase.size() > 0) {
phrases.push_back(phrase);
phrase.clear();
}
word = "";
return phrases;
}