在我开始之前,请道歉,这是我第三次尝试解决我的问题。最后两个问题似乎遭遇了通信故障。我使用以下Scrapy代码:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.cmdline import execute
from scrapy.utils.markup import remove_tags
import time
import re
import json
class ExampleSpider(CrawlSpider):
name = "mrcrawl2"
allowed_domains = ["whoscored.com"]
start_urls = ["http://www.whoscored.com"]
download_delay = 5
rules = [Rule(SgmlLinkExtractor(allow=('/Seasons'),deny=('/News', '/Fixtures', '/Graphics', '/Articles', '/Live', '/Matches', '/Explanations', '/Glossary', '/Players', 'ContactUs', 'TermsOfUse', 'Jobs', 'AboutUs', 'RSS'),), follow=False, callback='parse_item')]
def parse_item(self, response):
sel = Selector(response)
regex = re.compile('DataStore\.prime\(\'history\', { stageId: \d+ },\[\[.*?\]\]?\)?;', re.S) #use regex to find none html data on page
match2g = re.search(regex, response.body)
if match2g is not None:
match3g = match2g.group()
match3g = str(match3g)
match3g = match3g.replace("'", '').replace("'", '').replace('[', '').replace(']', '').replace('] );', '') #replace some characters from returned string
match3g = re.sub("DataStore\.prime\(history, { stageId: \d+ },", '', match3g) #here replacing also
match3g = match3g.replace(');', '') #and here
new_match3g = '' #create empty variable
for line in match3g.split("\n"): #for each line of old string
upl = line.rsplit(",",1)[1:] #split at the last comma
if upl:
upl1 = "{}".format("".join(list(upl[0]))) #new data format
upl2 = "{}".format(",".join(list(upl[0]))) #old data format
upl2 = str(upl2) #convert both to strings
upl1 = str(upl1)
new_match3g += line.replace(upl1, upl2) + '\n' #replace old substring with new one in new string created from old
print "UPL1 = ", upl1 #print new and old substrings to confirm conversion completed correctly
print "UPL2 = ", upl2
print new_match3g.decode('utf-8') #print new and old strings to confirm the new string has been built correctly
print match3g.decode('utf-8')
execute(['scrapy','crawl','mrcrawl2'])
这样做的目的是在解析每行数据的最后一个逗号后取所有数字,并用逗号分隔这些数字。
转换的一个例子是:
,000
至,0,0,0,
完成此过程后,我将创建一个新的空变量' new_match3g'然后用旧的变量“匹配3”来逐行填充,只用我新的逗号分隔的字符串替换旧的非逗号分隔符号。
然后我打印旧字符串和新字符串以及变换前后字符串,以观察它是否正常工作。
对于大多数示例而言,无论如何看似随机,生成的某些子字符串不会添加到' new_match3g'正确地说,即使它们已经在变量' upl2'中正确转换。
如果您要自己运行此代码并观察一些输出,您会看到我的意思。我不明白为什么这只是在某些方面发生,似乎是随机的。
由于
答案 0 :(得分:1)
s = "foo,bar,foo,foobar"
spl = s.rsplit(",",1)
to_be_updated = spl[1:]
updated = ",".join(to_be_updated[0])
orig = spl[0:1]
final = orig[0] +","+ updated
foo,bar,foo,f,o,o,b,a,r