对两个文件中的两列标题进行匹配。标题可以看作字符串:文件A有162283行×12列,文件B有3695行×6列。我使用了Levenshtein算法。对于文件B中第4列的每一行,计算它与文件A中第5列每一行的相似度,找到A中相似度最高的标题,把该标题以及它在文件A中的ID附加到文件B的相应行。
在计算相似度之前,我删除了字符串中的一些符号和单词,例如 ":"、"-"、"season"、"episode"。这样简单的程序,对于这种规模的数据却花了200多分钟。我想知道为什么。
我先写了一个Python程序,花了很长时间;然后又写了一个C++程序,花的时间更长。为什么呢?
请参阅以下程序:
Python:
import csv
import re
import difflib
import operator
import Levenshtein
import datetime
import glob
import os
import fnmatch
# Title-matching script: for every row of A.txt, find the row of B.txt whose
# normalized title has the highest Levenshtein ratio, then write the A row
# followed by the matching B title and id to a dated output file.


def normalize(title):
    """Return `title` lowercased with punctuation, filler words and spaces removed.

    The substitutions are applied in the same order as the original script:
    each noise token is first replaced by a space (so that removing one token
    cannot glue two fragments into another token), and all spaces are dropped
    at the end.
    """
    text = title.replace(",", " ").lower()
    for noise in ("series", "episode", "season", '"', "-", ":"):
        text = text.replace(noise, " ")
    return text.replace(" ", "")


def read_rows(path):
    """Read a tab-separated file into a list of column lists.

    Line terminators are stripped so that the last column does not carry a
    trailing newline into the keys written to the output file.
    """
    with open(path, "r") as handle:
        return [row.rstrip("\r\n").split("\t") for row in handle]


a = read_rows("D:\\A.txt")
b = read_rows("B.txt")

# Loop-invariant work hoisted out of the pairwise comparison: every B title is
# normalized once, and every output key ("title<TAB>id") is built once,
# instead of once per (A row, B row) pair.
b_titles = [normalize(row[4]) for row in b]
b_keys = [row[4] + "\t" + str(row[11]) for row in b]

my_list = []
for row_a in a:
    s1 = normalize(row_a[3])
    # Track the running best instead of filling a dict and taking max() over
    # it; on equal ratios the first B row in file order wins.
    best_ratio = -1.0
    best_key = ""
    for s2, key in zip(b_titles, b_keys):
        d = float(Levenshtein.ratio(s1, s2))
        if d > best_ratio:
            best_ratio = d
            best_key = key
    qq = "\t".join(row_a)
    my_list.append([qq.strip() + "\t" + best_key])

datestr = datetime.date.today().strftime("%y%m%d")
filename = "good2_codes_{}".format(datestr) + '.txt'
# NOTE(review): the original literal used a mismatched typographic quote and
# did not parse; the visible prefix "C" is kept verbatim - confirm the
# intended output directory/prefix.
with open("C" + filename, 'w') as out:
    for item in my_list:
        out.write(str(item[0]) + "\n")
C++:
#include <string>
#include<iostream>
#include <algorithm>
#include <fstream>
#include <boost/unordered_map.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <vector>
#include <boost/algorithm/string/replace.hpp>
using namespace std;
/// Computes the Levenshtein (edit) distance between two strings, comparing
/// them one `char` at a time.
///
/// Only a single row of the dynamic-programming table is kept, so the extra
/// memory is proportional to `s2.size()`.
///
/// @param s1 First string.
/// @param s2 Second string.
/// @return Minimum number of single-character insertions, deletions and
///         substitutions needed to turn `s1` into `s2`.
size_t uiLevenshteinDistance (const std::string &s1, const std::string &s2)
{
    const size_t m(s1.size());
    const size_t n(s2.size());
    if (m == 0) return n;
    if (n == 0) return m;

    // costs[j] is the distance between the prefix of s1 processed so far and
    // the first j characters of s2. A vector owns the buffer, so no manual
    // delete[] is needed on any exit path.
    std::vector<size_t> costs(n + 1);
    for (size_t k = 0; k <= n; ++k) costs[k] = k;

    for (size_t i = 0; i < m; ++i)
    {
        costs[0] = i + 1;
        size_t corner = i;  // value of the diagonal cell from the previous row
        for (size_t j = 0; j < n; ++j)
        {
            const size_t upper = costs[j + 1];  // cell directly above, before overwrite
            if (s1[i] == s2[j])
            {
                costs[j + 1] = corner;
            }
            else
            {
                costs[j + 1] = std::min(std::min(upper, corner), costs[j]) + 1;
            }
            corner = upper;
        }
    }
    return costs[n];
}
/// Matches every row of A.txt against every row of B.txt by normalized-title
/// similarity (1 - distance / longer length) and prints each distinct A row
/// followed by the title and id of its best-scoring B row.
///
/// @return 0 on success, 1 if either input file cannot be opened.
int main()
{
    // 0-based positions of the tab-separated columns used for matching.
    const size_t kTitleColumnA = 3;
    const size_t kTitleColumnB = 4;
    const size_t kIdColumnB = 11;

    // Lowercases a title and removes punctuation, filler words and spaces so
    // that only the distinguishing characters take part in the comparison.
    auto normalize = [](std::string title) -> std::string {
        boost::algorithm::to_lower(title);
        boost::replace_all(title, ",", "");
        boost::replace_all(title, "-", "");
        boost::replace_all(title, "season", "");
        boost::replace_all(title, "episode", "");
        boost::replace_all(title, "series", "");
        title.erase(std::remove(title.begin(), title.end(), '\"'), title.end());
        boost::replace_all(title, " ", "");
        return title;
    };

    std::ifstream file("A.txt");
    std::ifstream file1("B.txt");
    if (!file || !file1)
    {
        std::cerr << "cannot open A.txt and/or B.txt\n";
        return 1;
    }

    std::vector<std::string> lines;
    std::string line;
    while (std::getline(file, line)) {
        lines.push_back(line);
    }

    // Every B row is split and normalized exactly once here instead of once
    // per (A row, B row) pair inside the comparison loop.
    std::vector<std::string> candidateTitles;  // normalized B titles
    std::vector<std::string> candidateKeys;    // "title<TAB>id" as printed
    std::vector<std::string> tokens1;
    size_t skippedRows = 0;
    std::string fox;
    while (std::getline(file1, fox)) {
        boost::split(tokens1, fox, boost::algorithm::is_any_of("\t"));
        if (tokens1.size() <= kIdColumnB)
        {
            // Too few columns: indexing the id column would be out of range.
            ++skippedRows;
            continue;
        }
        candidateTitles.push_back(normalize(tokens1[kTitleColumnB]));
        candidateKeys.push_back(tokens1[kTitleColumnB] + "\t" + tokens1[kIdColumnB]);
    }
    if (skippedRows != 0)
    {
        std::cerr << "skipped " << skippedRows << " B.txt row(s) with too few columns\n";
    }

    boost::unordered_map<std::string, std::string> hashtable1;
    std::vector<std::string> tokens;
    for (size_t i = 0; i < lines.size(); ++i)
    {
        boost::split(tokens, lines[i], boost::algorithm::is_any_of("\t"));
        // A row without a title column is treated as having an empty title,
        // which scores 0 against every non-empty candidate.
        const std::string s1 = tokens.size() > kTitleColumnA
                                   ? normalize(tokens[kTitleColumnA])
                                   : std::string();

        float best = 0;
        bool found = false;
        size_t bestIndex = 0;
        for (size_t j = 0; j < candidateTitles.size(); ++j)
        {
            const std::string &s2 = candidateTitles[j];
            const size_t longest = std::max(s1.size(), s2.size());
            // Both titles empty: the ratio would be 0/0 (NaN), which never
            // compares greater than the running best, so skip it outright.
            if (longest == 0) continue;
            const float distance = float(uiLevenshteinDistance(s1, s2));
            const float similarity = 1 - distance / float(longest);
            if (similarity > best)
            {
                best = similarity;
                bestIndex = j;
                found = true;
            }
        }
        // An empty match is stored when no candidate scores above 0.
        hashtable1.insert(std::make_pair(lines[i], found ? candidateKeys[bestIndex] : std::string()));
    }

    // '\n' instead of endl: no need to flush the stream after every row.
    for (auto itr1 = hashtable1.begin(); itr1 != hashtable1.end(); ++itr1)
        std::cout << itr1->first << "\t" << itr1->second << '\n';
    return 0;
}
答案 0 :(得分:1)
因为您正在进行 len(a) * len(b) 次编辑距离计算。Levenshtein编辑距离并不适合直接用于这类匹配;应先设法缩小问题规模:将字符串规范化为小写,删除标点符号,拆分为词(token),并使用某种词干提取算法(如Porter、Snowball);之后,可以过滤掉没有任何共同单词、或共同单词不够多的字符串对;只有在显著缩小问题规模之后,才使用Levenshtein。
Python的Levenshtein模块之所以能与C++实现相竞争,是因为该模块本身是用C语言编写的。
答案 1 :(得分:0)
#include <string>
#include<iostream>
#include <algorithm>
#include <fstream>
#include <boost/unordered_map.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <vector>
#include <boost/algorithm/string/replace.hpp>
using namespace std;
/// Returns the Levenshtein (edit) distance between `s1` and `s2`, i.e. the
/// minimum number of single-character insertions, deletions and
/// substitutions that transform one string into the other. Characters are
/// compared as individual `char` values.
///
/// @param s1 First string.
/// @param s2 Second string; the working buffer has `s2.size() + 1` entries.
/// @return The edit distance; equals the other string's length when one of
///         the inputs is empty.
size_t uiLevenshteinDistance(const std::string &s1, const std::string &s2)
{
    const size_t m(s1.size());
    const size_t n(s2.size());
    if(m == 0) return n;
    if(n == 0) return m;

    // One rolling row of the DP table, owned by a vector so that it is
    // released automatically (no new[]/delete[] pairing to maintain).
    std::vector<size_t> costs(n + 1);
    for(size_t k = 0; k <= n; ++k) costs[k] = k;

    size_t i = 0;
    for(std::string::const_iterator it1 = s1.begin(); it1 != s1.end(); ++it1, ++i)
    {
        costs[0] = i + 1;
        size_t corner = i;  // diagonal neighbour taken from the previous row
        size_t j = 0;
        for(std::string::const_iterator it2 = s2.begin(); it2 != s2.end(); ++it2, ++j)
        {
            const size_t upper = costs[j + 1];  // previous-row value, saved before overwrite
            if(*it1 == *it2)
            {
                costs[j + 1] = corner;
            }
            else
            {
                const size_t cheapest = std::min(costs[j], std::min(upper, corner));
                costs[j + 1] = cheapest + 1;
            }
            corner = upper;
        }
    }
    return costs[n];
}
/// Reads A.txt and B.txt, normalizes the title column of every row once, and
/// for each A row prints the row followed by the title and id of the B row
/// whose normalized title is most similar (1 - distance / longer length).
///
/// @return 0 on success, 1 if either input file cannot be opened.
int main()
{
    // 0-based positions of the tab-separated columns used for matching.
    const size_t kTitleColumnA = 3;
    const size_t kTitleColumnB = 4;
    const size_t kIdColumnB = 11;

    std::ifstream file("A.txt");
    std::ifstream file1("B.txt");
    if(!file || !file1)
    {
        std::cerr << "cannot open A.txt and/or B.txt\n";
        return 1;
    }

    std::vector<std::string> lines;
    std::string line;
    while(std::getline(file, line)) {
        lines.push_back(line);
    }
    std::vector<std::string> foxs;
    std::string fox;
    while(std::getline(file1, fox)) {
        foxs.push_back(fox);
    }

    // Shared normalization for both files: lowercase, then drop punctuation,
    // filler words and spaces.
    auto normalize = [](std::string title) -> std::string {
        boost::algorithm::to_lower(title);
        boost::replace_all(title, ",", "");
        boost::replace_all(title, "-", "");
        boost::replace_all(title, "season", "");
        boost::replace_all(title, "episode", "");
        boost::replace_all(title, "series", "");
        title.erase(std::remove(title.begin(), title.end(), '\"'), title.end());
        boost::replace_all(title, " ", "");
        return title;
    };

    // Normalized A titles, index-aligned with `lines`.
    std::vector<std::string> tokens;
    std::vector<std::string> s1s;
    s1s.reserve(lines.size());
    for(size_t i = 0; i < lines.size(); ++i)
    {
        boost::split(tokens, lines[i], boost::algorithm::is_any_of("\t"));
        // A row without a title column is treated as having an empty title.
        s1s.push_back(tokens.size() > kTitleColumnA ? normalize(tokens[kTitleColumnA])
                                                    : std::string());
    }

    // Normalized B titles and their output keys, index-aligned with each
    // other. The key must be captured per row here: after this loop `tokens1`
    // only holds the last row of B.txt, so building the key later would label
    // every match with that last row's title and id.
    std::vector<std::string> tokens1;
    std::vector<std::string> s2s;
    std::vector<std::string> keys;
    s2s.reserve(foxs.size());
    keys.reserve(foxs.size());
    size_t skippedRows = 0;
    for(size_t j = 0; j < foxs.size(); ++j)
    {
        boost::split(tokens1, foxs[j], boost::algorithm::is_any_of("\t"));
        if(tokens1.size() <= kIdColumnB)
        {
            // Too few columns: indexing the id column would be out of range.
            ++skippedRows;
            continue;
        }
        s2s.push_back(normalize(tokens1[kTitleColumnB]));
        keys.push_back(tokens1[kTitleColumnB] + "\t" + tokens1[kIdColumnB]);
    }
    if(skippedRows != 0)
    {
        std::cerr << "skipped " << skippedRows << " B.txt row(s) with too few columns\n";
    }

    boost::unordered_map<std::string, std::string> hashtable1;
    for(size_t i = 0; i < lines.size(); ++i)
    {
        float best = 0;
        bool found = false;
        size_t bestIndex = 0;
        for(size_t j = 0; j < s2s.size(); ++j)
        {
            const size_t longest = std::max(s1s[i].size(), s2s[j].size());
            // Both titles empty: the ratio would be 0/0 (NaN) and could never
            // win the comparison below, so skip it.
            if(longest == 0) continue;
            const float distance = float(uiLevenshteinDistance(s1s[i], s2s[j]));
            const float similarity = 1 - distance / float(longest);
            if(similarity > best)
            {
                best = similarity;
                bestIndex = j;
                found = true;
            }
        }
        // An empty match is stored when no candidate scores above 0.
        hashtable1.insert(std::make_pair(lines[i], found ? keys[bestIndex] : std::string()));
    }

    // '\n' instead of endl: no need to flush the stream after every row.
    for(auto itr1 = hashtable1.begin(); itr1 != hashtable1.end(); ++itr1)
        std::cout << itr1->first << "\t" << itr1->second << '\n';
    return 0;
}