我正在尝试在csv文件中找到前k个最常见的单词。原始文件是csv,并且有超过1M行,因此我跳过了一些阶段来关注问题区域。
我已经事先去掉了标点符号,并把文本全部转换为小写,因此测试文本中只包含单词和数字;在解析数据时,我还会跳过数字。
这是我的代码: 头文件 ->
#pragma once

#include <assert.h>
#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <string>

// Kept for backward compatibility: the implementation file and main() use
// unqualified standard-library names (string, cout, ifstream, ...).
using namespace std;

/// One slot of the word table: a word together with its occurrence count.
struct sitecount {
    string name = " ";        // placeholder text for a slot that holds no word yet
    int count = 0;            // number of occurrences; 0 means the slot is unused
    bool first_ten_rank = 0;  // set while the slot is referenced from site::rankings
};

/// Fixed-capacity word-frequency table that also tracks the ten most frequent words.
class site {
private:
    // Named constants so the array bounds and `size` cannot drift apart.
    static constexpr int kTableSize = 14717;
    static constexpr int kTopN = 10;

    int size = kTableSize;
    sitecount site_placement[kTableSize];
    // Value-initialized to null so a table whose constructor has not filled
    // the ranking fails deterministically instead of reading garbage pointers.
    sitecount* rankings[kTopN] = {};

public:
    site();
    /// Builds the table from the whitespace-separated tokens of the named file.
    site(string);

    /// Counts one occurrence of the given word.
    void add(string);
    /// Re-orders `rankings` by ascending count.
    void sort_rank();
    /// Updates the top-ten ranking after the slot at the given index changed.
    void check_ten(int);
    /// Prints the ranking, highest count first.
    void print_ten() const;

    int getSize();
    /// Returns the slot index of the given word, or -1 if it is not stored.
    int find(string) const;

    /// Probe position for the given word at the given probe number.
    int dh(string, int) const;
    int dhash1(string) const;
    int dhash2(string) const;
};
cpp->
site::site() {} int site::dh(string n, int i) const { return abs(dhash1(n) + i * dhash2(n)) % size; } int site::dhash1(string name) const { int site_res = 7; for (int i = 0; i < name.length(); i++) site_res = (site_res * 31 + name[i]) % 1000000; return abs(site_res) % size; } int site::getSize() { return size; } int site::dhash2(string name) const { int site_res = 7; for (int i = 0; i < name.length(); i++) site_res = (site_res * 31 + name[i]) % 1000000; return 1 + (abs(site_res) % (size - 1)); } int site::find(string name) const { int i = 0; int check_pl = dh(name, i); while (site_placement[check_pl].count != 0 || i == size) { if (site_placement[check_pl].name == name) return check_pl; i++; check_pl = dh(name, i); } return -1; } void site::add(string name) { int first_check = find(name); if (first_check == -1) { int i = 0; int place = dh(name, i); while (site_placement[place].count != 0) { i += 1; place = dh(name, i); } site_placement[place].name = name; site_placement[place].count = 1; check_ten(place); } else { site_placement[first_check].count++; check_ten(first_check); } } void site::check_ten(int place) { if (site_placement[place].first_ten_rank) { sort_rank(); return; } else if (rankings[0]->count > site_placement[place].count) return; rankings[0] = &site_placement[place]; site_placement[place].first_ten_rank = 1; sort_rank(); } void site::print_ten() const { cout << "RANKINGS" << "- - -" << "SITE" << "- - -" << "HIT" << endl; for (int i = 9; i > -1; i--) cout << 10 - i << "-)" << "- - -" << rankings[i]->name << "- - -" << rankings[i]->count << "- - -" << endl; } void site::sort_rank() { sitecount* temp; for (int i = 1; i < 10; i++) { int j = i; while (j > 0 && (rankings[j - 1]->count) > (rankings[j]->count)) { temp = rankings[j]; rankings[j] = rankings[j - 1]; rankings[j - 1] = temp; j--; } } } site::site(string file_name) { for (int i = 0; i < 10; i++) rankings[i] = &site_placement[i]; ifstream a; string s; s.clear(); a.open(file_name.c_str()); 
assert(a.is_open() == 1 && "File could not be found"); string one, two, three, four, five, six, seven, eight, nine, zero; one = "1"; two = "2"; three = "3"; four = "4"; five = "5"; six = "6"; seven ="7"; eight = "8"; nine = "9"; zero = "0"; while (a>>s ) { if(!(s.length()==0)&& s.compare(one)&& s.compare(two)&& s.compare(three)&& s.compare(four)&& s.compare(five) && s.compare(six)&& s.compare(seven)&& s.compare(eight)&& s.compare(nine)&&s.compare(zero)) add(s); } a.close(); }
main --->
#include "site.h"

#include <time.h>

using namespace std;

/// Builds the word table from "output.txt", prints the ten most frequent
/// words and reports the processor time the run took.
int main()
{
    const clock_t begin_time = clock();

    site my_site("output.txt");
    my_site.print_ten();

    const clock_t end_time = clock();
    // clock() counts in units of 1/CLOCKS_PER_SEC seconds, which is not
    // milliseconds on every platform; convert before labelling the value.
    const double elapsed_ms =
        1000.0 * static_cast<double>(end_time - begin_time) / CLOCKS_PER_SEC;
    cout << "It took : " << elapsed_ms << " milliseconds" << endl;

    // NOTE(review): "PAUSE" is a Windows shell command; presumably used to keep
    // the console window open. Confirm before building for other platforms.
    system("PAUSE");
    return 0;
}
文本文件-> https://textuploader.com/d8xwi!
My results
1) love--31
2) kindle2--20
3) latex--10
4) tek--8
5) lt3--5
6) cool--4
7) lot--4
8) blah--3
9) card--3
10)favorite--2
True results
time 48
night 37
good 34
warner 34
love 31
museum 26
nike 26
im 26
gm 22
jquery 21
twitter 20
lebron 20
great 20
google 20
safeway 20
kindle2 20
hate 19
rt 19
today 19
watch 18
api 16
day 15
amp 15
atampt 15
work 14
答案 0 :(得分:0)
当某个条目被挤出前十名列表时,必须把它的 first_ten_rank 标志重置为 0,
否则它之后将再也无法重新进入前十名。
void site::check_ten(int place)
{
if (site_placement[place].first_ten_rank)
{
sort_rank();
return;
}
else if (rankings[0]->count > site_placement[place].count)
return;
//When a site is removed from the top ten list, its first_ten_rank must be set to zero.
//otherwise, a removed one from topten list will never enter the topten again
rankings[0]->first_ten_rank = 0;
rankings[0] = &site_placement[place];
site_placement[place].first_ten_rank = 1;
sort_rank();
}