我正在做一个个人项目,现在需要对一些文本数据进行分词(tokenize)。数据来自这个网站:http://www.achemenet.com/,其中包含数十篇文本(有些文本已经残损)。例如,下面是一个典型的文本网址:
页面里的文字看起来是这样的:
zú-lum-ma šá ina é níg-ga šá gú i7 zimbirki
2 a-na iti še u4 22-kam mu sag Ida-a-ru-eš-šú
lugal tin-tirki lugal kur-kur
——————————————————
4 40 gur ⌈zú-lum-ma⌉-šú Id30-šeš-mu
——————————————————
15 gur ri-ih-tu4 zú-lum-ma-šú ta
6 ⌈é⌉ [ x I]ìr-ia lú mi-ṣir-a-a e-ṭir
——————————————————
70 ma-ši-hi šá ⌈sat?-tuk?⌉ zú-lum-ma i-na
8 pap-pa-su lú mu-ú-tu a-na
Idnà-a-mu sì-na
——————————————————
10 6 gur zú-lum-ma ina pap-pa-su šá é dIM
a-na Idutu-su a Imu-šeb-!ši-dšú sì-na
——————————————————
这些横线表示文本中的分段,我也需要把它们作为标记(token)保留下来,而不是忽略它们。
不过,我需要忽略行号,并让所有数字共用同一个标记(token)。
'?' 表示我们不确定该词的释读是否正确,'⌈' 表示该处文本略有残损,所以这些字符我也可以忽略。
以下代码会获取所有需要的 URL,并将它们保存到一个 .txt 文件中(用 Python 2.7 编写):
# Collect the URL of every text linked from the achemenet.com Babylonian
# archive index pages, write the URLs to a text file and report progress.
# NOTE(review): this targets Python 2.7 (`urllib.urlopen`, and `input` that
# evaluates the answer to an int). The `print(..., end="")` calls presumably
# rely on `from __future__ import print_function` at the top of the file —
# confirm.
import os
import time
import urllib

choose = input("How would you like to see the procces?\n1 = by % \n2 = printing the text \n")

# Directory where the downloaded documents will be stored.
newpath = r'./babylonienTexts2'
if not os.path.exists(newpath):
    os.makedirs(newpath)
else:
    print ("you already got the files or files with the same name on your computer")
    helper = input("Are you sure you want to replace them?\n1 - yes I want to replace them\n2 - no I don't want to replace them\n")
    if helper != 1:
        exit()

# The URL list lives under ./babylonienTexts, which is NOT the directory
# created above; make sure it exists so that open() cannot fail on a fresh
# machine.
urls_path = r"./babylonienTexts/textsUrls.txt"
urls_dir = os.path.dirname(urls_path)
if not os.path.exists(urls_dir):
    os.makedirs(urls_dir)

myPageUrl = []
prhelp = 1.0  # number of the URL currently being reported (progress counter)

# A line of an index page is treated as a link to a text when it mentions one
# of these publication names.
markers = ("Strassmaier", "YOS", "Wunsch", "Murašu", "Jursa")
names = ["archives-ebabbar","archives-eanna","archives-egibi","archives-nappahu","archives-murasu","autres-archives-privees"]

with open(urls_path, "w") as text_file:
    print(r"urls download: (to ./babylonienTexts/textsUrls.txt)")
    # Copy the URLs to the text file and to the screen.
    for name in names:
        # NOTE(review): only page 1 is requested, although the original
        # comment says the site has 8 pages — confirm the intended range.
        for page in range(1, 2):
            link = "http://www.achemenet.com/fr/tree/?/sources-textuelles/textes-par-langues-et-ecritures/babylonien/%s/%s/96/0#set" % (name, page)
            pagecode = urllib.urlopen(link)
            try:
                # Read the page line by line until it ends (the original
                # issued a fixed 10000 readline() calls instead).
                for myfile in pagecode:
                    if not any(marker in myfile for marker in markers):
                        continue
                    parts = myfile.split('"')
                    if len(parts) < 4:
                        # No fourth quote-delimited field: not a usable link.
                        continue
                    url = "http://www.achemenet.com/" + parts[3]
                    myPageUrl.append(url)
                    text_file.write(url + "\n")
                    if choose == 1:
                        # 1834 is presumably the expected total number of
                        # URLs — TODO confirm.
                        b = str(round((prhelp / 1834) * 100,3))
                        print('\r',end = "")
                        print(b + "%",end = "")
                        time.sleep(0.01)
                        prhelp = prhelp + 1
                    else:
                        print (url)
            finally:
                pagecode.close()

print ("\nfinised download the urls")
我写了一段代码来遍历所有 URL,在网页的 HTML 源码中查找文本,并尝试对其进行分词,但它就是无法正常工作。非 ASCII 字符,再加上页面结构混乱、文本难以定位,这两者叠加在一起让我无从下手。
不过,我已经为普通文本写好了一个分词系统(用 Python 3 编写):
import argparse
from os import walk
from pathlib import Path
import operator
# Command-line options: where the data files live, where the summary log is
# written, and which words to skip.
parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
parser.add_argument('--dataLocation', type=str, default='./data/penn',
                    help='location of the data to analyze')
parser.add_argument('--logFile', type=str, default='./data/log.txt',
                    help='location of the log file')
parser.add_argument('--ignore', type=str, default='',
                    help='words to ignore')
parser.add_argument('--ignoreStartingWith', type=str, default='http',
                    help='words to ignore that starts with')
args = parser.parse_args()

# Names of the files directly inside --dataLocation (the `break` stops the
# walk after the top-level directory, so sub-directories are not visited).
targateFiles = []
try:
    for (dirpath, dirnames, filenames) in walk(args.dataLocation):
        targateFiles.extend(filenames)
        break
except Exception as e:
    # Top-level script boundary: report the problem and stop.
    print("Error trying to read the files from " + args.dataLocation)
    # str(e) is required: concatenating the exception object itself raises
    # TypeError and would hide the real error.
    print("--- Error: \n" + str(e))
    print("Exiting")
    exit(1)

# Ask before overwriting the log file that will actually be written
# (--logFile), not a hard-coded path.
my_file = Path(args.logFile)
if my_file.is_file():
    userInput = input(args.logFile + " already exist, are you sure you want to replace it? y or n \n")
    if userInput not in ("yes", "y"):
        print("Exiting")
        exit(0)
# A word object for help
class Word(object):
    """Per-word record: its index, its frequency and its follower counts.

    Attributes:
        number: 1-based index assigned to the word by ``create``.
        numberOfAppear: how many times the word has been seen.
        dict: mapping of following word -> number of times it followed.
        numberAfter: initialised to 0; not updated anywhere in this class.
    """

    def __init__(self):
        # The original spelled this ``_init_``, so it was never run as the
        # constructor and a fresh Word had no attributes at all.
        self.number = 0
        self.numberOfAppear = 0
        self.dict = {}
        self.numberAfter = 0

    def create(self, number, dict):
        """Initialise the record for a word seen for the first time.

        Args:
            number: index to assign to the word.
            dict: follower-count mapping to store (kept by reference).
        """
        self.number = number
        self.numberOfAppear = 1
        self.dict = dict

    def appear(self):
        """Record one more occurrence of the word."""
        self.numberOfAppear += 1
# A dictionary object for help
class Dictionary(object):
    """Vocabulary that maps each word to its ``Word`` record.

    Attributes:
        word2idx: mapping of word -> ``Word`` record.
    """

    def __init__(self):
        # The original spelled this ``_init_``, so ``word2idx`` was never
        # created and the first ``add_word`` call raised AttributeError.
        self.word2idx = {}

    def predict(self, word):
        """Return the word that most often followed ``word``.

        Raises:
            KeyError: if ``word`` is not in the dictionary.
            ValueError: if ``word`` has no recorded follower.
        """
        # ``items()`` replaces the Python 2-only ``iteritems()``.
        followers = self.word2idx[word].dict
        return max(followers.items(), key=operator.itemgetter(1))[0]

    def add_word(self, word):
        """Register one occurrence of ``word`` and return its record.

        A new word gets the next 1-based index; a known word only has its
        occurrence count incremented.
        """
        if word not in self.word2idx:
            new = Word()
            new.create(len(self.word2idx) + 1, {})
            self.word2idx[word] = new
        else:
            self.word2idx[word].numberOfAppear += 1
        return self.word2idx[word]

    def add_next(self, worda, wordb):
        """Count one occurrence of ``wordb`` directly after ``worda``.

        Raises:
            KeyError: if ``worda`` is not in the dictionary.
        """
        followers = self.word2idx[worda].dict
        followers[wordb] = followers.get(wordb, 0) + 1

    def findIdxByWord(self, word):
        """Return the 0-based index of ``word`` (its ``number`` minus one)."""
        return self.word2idx[word].number - 1

    def __len__(self):
        """Return the number of distinct words (was misspelled ``_len_``)."""
        return len(self.word2idx)
# Vocabulary shared by the whole script; filled by dictoneryFile() and by the
# follower-counting pass further down.
globalDictionery = Dictionary()
# Running total of the word counts returned by dictoneryFile().
totalNumberOfWords = 0
def dictoneryFile(file):
    """Add every non-ignored word of one data file to ``globalDictionery``.

    Words are the whitespace-separated pieces of each line. A word that
    starts with ``args.ignoreStartingWith`` is skipped and not counted.

    Args:
        file: name of the file, relative to ``args.dataLocation``.

    Returns:
        The number of words that were added to (or counted in) the dictionary.
    """
    counter = 0
    # ``with`` closes the file even if reading or add_word() raises.
    # NOTE(review): no encoding is passed, so the platform default is used.
    with open(args.dataLocation + '/' + file, 'r') as tFile:
        for line in tFile:
            for word in line.split():
                if word.startswith(args.ignoreStartingWith):
                    continue
                globalDictionery.add_word(word)
                counter += 1
    return counter
# Ask for confirmation before processing more than one file.
if len(targateFiles) > 1:
    userInput = input("Multiple files founded. Continue? y or n.\n")
    if userInput not in ("yes", "y"):
        print("Exiting")
        exit(0)

# First pass: register every word and count its occurrences.
print("Finding all the words and put in first dictionary")
for file in targateFiles:
    print("Reading from file: " + file)
    totalNumberOfWords += dictoneryFile(file)

# Second pass: for each pair of adjacent words on a line, count the second
# word as a follower of the first.
print("Finding all the words and put in upgraded dictionary")
for file in targateFiles:
    print("Reading from file: " + file)
    # ``with`` closes the file even if add_next() raises.
    with open(args.dataLocation + '/' + file, 'r') as tFile:
        for line in tFile:
            line = line.split()
            for i, k in zip(line, line[1:]):
                # Words with the ignored prefix were skipped by the first
                # pass and have no dictionary entry, so add_next() would
                # raise KeyError for them; leave such pairs out.
                if (i.startswith(args.ignoreStartingWith)
                        or k.startswith(args.ignoreStartingWith)):
                    continue
                globalDictionery.add_next(i, k)
def prettyPrint(dict, file):
    """Write a readable dump of a word table to ``file``.

    For each word one line ``** word: number numberOfAppear`` is written,
    followed by one line ``--- follower: count`` per entry of the word's
    follower mapping.

    Args:
        dict: mapping of word -> record exposing ``number``,
            ``numberOfAppear`` and ``dict`` (follower -> count).
        file: object with a ``write(str)`` method.
    """
    # Iterate over items() so each record is looked up only once.
    for el, entry in dict.items():
        file.write("** {}: {} {}\n".format(el, entry.number, entry.numberOfAppear))
        for sub, count in entry.dict.items():
            file.write("--- {}: {}\n".format(sub, count))
# Write the summary (vocabulary size, total word count, then the per-word
# dump) to the log file chosen with --logFile.
# The message names the file that is really written instead of a fixed path.
print("Printing to summarized data to " + args.logFile)
# ``with`` closes the log even if writing fails; UTF-8 is set explicitly so
# that words containing non-ASCII characters can always be written.
with open(args.logFile, 'w', encoding='utf-8') as logFile:
    logFile.write("Dictionary length: " + str(len(globalDictionery.word2idx)) + "\n")
    logFile.write("Total number of words: " + str(totalNumberOfWords) + "\n")
    prettyPrint(globalDictionery.word2idx, logFile)
print("Analyze done")
现在的问题是:有没有办法让我正确地读取、分词并分析这些文本,使分词过程按照我想要的方式工作?答案用 Python 2.x 还是 3.x 对我来说都可以。