I'm using the Wikipedia API to scrape Norwegian-language content, clean it up, and write it to a file, for use in training a language model for CMU Sphinx.

I'm calling the .random function in a for loop, but I've run into a problem: I count unique pages by pageId, and I'm getting a large number of duplicates. In the beginning there aren't many, but after a while there are about twice as many duplicates as unique IDs; by the time we have 40 unique pages, we have roughly 80 duplicates.
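To make the counting concrete, here is a stripped-down sketch of the check (it uses a set instead of the list in the full script below, and the iteration count is arbitrary):

import wikipedia

wikipedia.set_lang("no")
seen = set()        # pageids fetched so far
duplicates = 0

for _ in range(200):  # arbitrary number of attempts for this sketch
    try:
        title = wikipedia.random(1)            # one random article title
        pageid = wikipedia.page(title).pageid
        if pageid in seen:
            duplicates += 1
        else:
            seen.add(pageid)
    except (wikipedia.DisambiguationError, wikipedia.exceptions.PageError):
        continue  # skip ambiguous or missing pages

print("unique: {0}, duplicates: {1}".format(len(seen), duplicates))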
Is there something about the .random function that we're not seeing?

Here is the code. The regexes are wrapped in functions to make the order of the filters easier to read.
"""
Module wikitest.py - Script for scraping text from Wikipedia, based on articles
found by using wikipedia.random.
Used for gathering and formatting written text representative of the Norwegian
language, for use in training language models.
"""
import re
import wikipedia
# Create regexes to filter the results
specialcharreg = re.compile(r'[^A-Za-zÆØÅæøå0-9.,-]+', re.IGNORECASE)
whitespacereg = re.compile(r' {2,}', re.IGNORECASE)  # runs of two or more spaces
punctuationreg = re.compile(r'[.]+', re.IGNORECASE)
shortsentencereg = re.compile(r'(</?s>)([a-zæøåA-ZÆØÅ0-9,\- ]{0,50})(</?s>)', re.IGNORECASE)
isbnreg = re.compile(r'(ISBN)([0-9- ]{7,21})', re.IGNORECASE)
nospaceaftertagreg = re.compile(r'(<s>)([a-zæøåA-ZÆØÅ,-])', re.IGNORECASE)
# filter methods for formatting the text
def nospecialchar(wikicontent): return re.sub(specialcharreg, ' ', wikicontent)
def nodoublewhitespace(wikicontent): return re.sub(whitespacereg, ' ', wikicontent)
def faultysentence(wikicontent): return re.sub(shortsentencereg, '', wikicontent)
def inserttags(wikicontent): return re.sub(punctuationreg, ' </s>\n<s>', wikicontent)
def noemptylines(wikicontent): return "".join([s for s in wikicontent.splitlines(True) if s.strip("\r\n")])
def noisbn(wikicontent): return re.sub(isbnreg, '', wikicontent)
def nospaceaftertag(wikicontent): return re.sub(nospaceaftertagreg, r'\1 \2', wikicontent)
# We only want articles written in Norwegian
wikipedia.set_lang("no")
# initialize different counters for counting duplicates and uniques
idlist = []
duplicatecount = 0
uniquecount = 0
showuniquecount = 0
# define number of pages to get
for x in range(0, 10001):
    try:
        randompages = wikipedia.random(1)
        for page in randompages:
            # get wikipedia page
            wikipage = wikipedia.page(page)
            # get page ID
            pageid = wikipage.pageid
            # check for ID duplicate
            if pageid not in idlist:
                # add ID to the list of fetched pages
                idlist.append(pageid)
                uniquecount += 1
                showuniquecount += 1
                # on every tenth unique page, print the current unique count
                if showuniquecount == 10:
                    print("Current unique page count: {0}".format(uniquecount))
                    showuniquecount = 0
                wikicontent = wikipage.content
                # filter the content using the different regex functions
                filteredcontent = \
                    faultysentence(
                        noemptylines(
                            nospaceaftertag(
                                faultysentence(
                                    inserttags(
                                        nodoublewhitespace(
                                            noisbn(
                                                nospecialchar(
                                                    wikicontent))))))))
                print(filteredcontent)
                # append the filtered content to the output file
                with open("wikiscraping2.txt", "a", encoding="utf-8") as the_file:
                    the_file.write('<s> ' + filteredcontent)
            else:
                duplicatecount += 1
                print("Duplicate! Current duplicate count: {0}".format(duplicatecount))
    # skip pages where wikipedia cannot tell which page is meant
    except wikipedia.DisambiguationError:
        print('DisambiguationError!')
        continue
    # skip pages that cannot be found
    except wikipedia.exceptions.PageError:
        print('PageError! (Page could not be found)')
        continue
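For clarity, the nested call above applies the filters innermost first. Spelled out step by step on a small invented sample string (the sample is made up purely for illustration):

sample = "Eksempel (ISBN 82-05-31030-0) er   en   kort tekst. Slutt."
step1 = nospecialchar(sample)      # replace disallowed characters with spaces
step2 = noisbn(step1)              # strip ISBN numbers
step3 = nodoublewhitespace(step2)  # collapse runs of spaces
step4 = inserttags(step3)          # turn '.' into ' </s>\n<s>' sentence tags
step5 = faultysentence(step4)      # drop tagged sentences of 50 characters or fewer
step6 = nospaceaftertag(step5)     # make sure a space follows each '<s>' tag
step7 = noemptylines(step6)        # remove empty lines
step8 = faultysentence(step7)      # drop any short sentences the steps above produced
print(step8)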