#pragma once

#include <algorithm>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

// NOTE(review): a using-directive at header scope leaks into every file that
// includes this header. It is kept because existing includers may rely on it.
using namespace std;

/// A phrase is the ordered list of words found between two phrase breaks.
using phrase = vector<string>;

/// Returns true when `a` may be part of a word: ASCII letters, ASCII digits
/// and '@'.
inline bool isWordChar(char a) {
    return ('a' <= a && a <= 'z') ||
           ('A' <= a && a <= 'Z') ||
           ('0' <= a && a <= '9') ||
           a == '@';
}

/// Returns true when `a` terminates the current phrase (punctuation,
/// brackets, slashes, underscore, tilde or a newline).
inline bool isPhraseBreak(char a) {
    switch (a) {
        case '!': case ',': case '.': case ':': case ';': case '?':
        case '(': case ')': case '/': case '\\': case '\n':
        case '[': case ']': case '_': case '{': case '}': case '~':
            return true;
        default:
            return false;
    }
}

/// Loads the stop-word list from "stop_words.txt" (one entry per line,
/// resolved relative to the current working directory).
///
/// @return The set of lines read. If the file cannot be opened the set is
///         empty, so no word gets filtered.
inline unordered_set<string> getStopWords() {
    unordered_set<string> swords;
    ifstream stop_words("stop_words.txt");
    string line;
    while (getline(stop_words, line)) {
        // A file saved with CRLF line endings leaves a trailing '\r' on every
        // line, which would prevent any stop word from ever matching.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        swords.insert(line);
    }
    return swords;
}

/// Lower-cases an ASCII upper-case letter; any other character is returned
/// unchanged.
inline char asciitolower(char in) {
    if ('A' <= in && in <= 'Z') {
        return static_cast<char>(in + ('a' - 'A'));
    }
    return in;
}

/// Splits `text` into phrases of lower-cased words.
///
/// Characters accepted by isWordChar() are grouped into words. Any other
/// character ends the current word, and a character accepted by
/// isPhraseBreak() additionally ends the current phrase. Words found in the
/// stop-word list (see getStopWords()) are dropped, and phrases left without
/// words are not emitted.
///
/// @param text Input text; taken by value because it is lower-cased in place.
/// @return The phrases in order of appearance, each a list of words.
inline vector<phrase> parseFile(string text) {
    for (char& c : text) {
        c = asciitolower(c);
    }

    // The stop-word file is re-read on every call.
    const unordered_set<string> stop_words = getStopWords();

    vector<phrase> phrases;
    phrase current;
    string word;

    // Appends the pending word to the current phrase unless it is empty or a
    // stop word, then resets it.
    auto flushWord = [&]() {
        if (!word.empty() && stop_words.find(word) == stop_words.end()) {
            current.push_back(std::move(word));
        }
        word.clear();
    };

    // Emits the current phrase if it holds at least one word, then resets it.
    auto flushPhrase = [&]() {
        if (!current.empty()) {
            phrases.push_back(std::move(current));
            current.clear();
        }
    };

    for (const char c : text) {
        if (isWordChar(c)) {
            word.push_back(c);
            continue;
        }
        // Every non-word character ends the word; only phrase breaks end the
        // phrase as well.
        flushWord();
        if (isPhraseBreak(c)) {
            flushPhrase();
        }
    }

    // Text that does not end on a separator still has a pending word/phrase.
    flushWord();
    flushPhrase();

    return phrases;
}