You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
88 lines
2.4 KiB
C++
88 lines
2.4 KiB
C++
#pragma once
|
|
|
|
#include <iostream>
|
|
#include <vector>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
#include <stdint.h>
|
|
#include <fstream>
|
|
|
|
using namespace std;
|
|
|
|
// A phrase is an ordered sequence of words, each stored as its own string.
typedef vector<string> phrase;
|
|
|
|
// Returns true when `a` can be part of a word: an ASCII letter, an ASCII
// digit, or '@'. Every other character ends the word being read.
//
// Declared inline because this definition lives in a header; without it,
// including the header from two translation units breaks the link step.
inline bool isWordChar(char a) {
    const bool is_lower = (a >= 'a' && a <= 'z');
    const bool is_upper = (a >= 'A' && a <= 'Z');
    const bool is_digit = (a >= '0' && a <= '9');
    return is_lower || is_upper || is_digit || a == '@';
}
|
|
|
|
// Returns true when `a` terminates the current phrase: sentence punctuation,
// brackets of any kind, slashes, underscore, tilde, or a newline.
//
// Declared inline because this definition lives in a header; without it,
// including the header from two translation units breaks the link step.
inline bool isPhraseBreak(char a) {
    switch (a) {
        case '!':
        case ',':
        case '.':
        case ':':
        case ';':
        case '?':
        case '(':
        case ')':
        case '/':
        case '\\':
        case '\n':
        case '[':
        case ']':
        case '_':
        case '{':
        case '}':
        case '~':
            return true;
        default:
            return false;
    }
}
|
|
|
|
// Loads the stop-word list from "stop_words.txt" (resolved against the
// current working directory), one word per line.
//
// A trailing '\r' is stripped from each line so that a file saved with
// Windows (CRLF) line endings still yields usable entries; blank lines are
// skipped. If the file cannot be opened the returned set is simply empty.
//
// Declared inline because this definition lives in a header; without it,
// including the header from two translation units breaks the link step.
inline std::unordered_set<std::string> getStopWords() {
    std::unordered_set<std::string> swords;

    std::ifstream stop_words("stop_words.txt");
    std::string line;
    while (std::getline(stop_words, line)) {
        // getline only removes '\n'; drop the '\r' left behind by CRLF files.
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        if (!line.empty()) {
            swords.insert(line);
        }
    }

    return swords;
}
|
|
|
|
// Converts an ASCII upper-case letter to its lower-case counterpart and
// returns every other character unchanged. Locale-independent: only the
// range 'A'..'Z' is ever modified.
//
// Declared inline because this definition lives in a header; without it,
// including the header from two translation units breaks the link step.
inline char asciitolower(char in) {
    if (in >= 'A' && in <= 'Z') {
        // Upper- and lower-case ASCII letters sit a fixed distance apart.
        return static_cast<char>(in + ('a' - 'A'));
    }
    return in;
}
|
|
|
|
vector<phrase> parseFile (string text) {
|
|
|
|
for (size_t i = 0; i < text.size(); i++)
|
|
{
|
|
text[i] = asciitolower(text[i]);
|
|
}
|
|
|
|
|
|
unordered_set<string> stop_words = getStopWords();
|
|
|
|
// prendo un vettore, raggruppo caratteri tra spazi in stringhe fino a punteggiatura, raggruppo queste stringhe in periodi che inserisco in un nuovo vettore
|
|
vector<phrase> phrases;
|
|
phrase phrase;
|
|
string word;
|
|
for (int i = 0; i < text.size(); i++) {
|
|
if (isWordChar(text[i])) {
|
|
word.append(1, text[i]);
|
|
}
|
|
else if (isPhraseBreak(text[i])) {
|
|
if (word.size() > 0 && stop_words.find(word) == stop_words.end()) {
|
|
phrase.push_back(word);
|
|
}
|
|
word = "";
|
|
if (phrase.size() > 0) {
|
|
phrases.push_back(phrase);
|
|
phrase.clear();
|
|
}
|
|
}
|
|
else {
|
|
if (word.size() > 0 && stop_words.find(word) == stop_words.end()) {
|
|
phrase.push_back(word);
|
|
}
|
|
|
|
word = "";
|
|
}
|
|
}
|
|
if (word.size() > 0 && stop_words.find(word) == stop_words.end()) {
|
|
phrase.push_back(word);
|
|
}
|
|
if (phrase.size() > 0) {
|
|
phrases.push_back(phrase);
|
|
phrase.clear();
|
|
}
|
|
word = "";
|
|
|
|
return phrases;
|
|
} |