我编写了一个 Python 3 脚本,用于生成格式化的 XML RSS 输出。但是当我在 Chrome 中打开输出的 XML 文件时,发现换行符没有显示出来。这是我的代码:
import requests
import csv
import re
import math
from babel.numbers import format_decimal
from lxml import html, etree
from rfeed import *
class Scraper:
    """Fetch a feed, turn unseen ``<entry>`` sections into RSS items, and save them.

    One call to :meth:`parse_xml` downloads ``url``, builds an ``Item`` for
    every entry whose key (PIID + signed date) is not yet listed in
    ``existing.csv``, writes those items to ``fpds_rss_feed.xml`` and appends
    the new keys to ``existing.csv``.
    """

    # Query-string suffix appended to every entry's alternate link.
    _LINK_SUFFIX = "&s=FPDS&templateName=1.4.4&indexName=awardfull&sortBy=SIGNED_DATE&desc=Y"

    # Title layout: office, place of performance, obligated amount, vendor.
    _TITLE_TEMPLATE = "{} in {} – {} to {}"

    # Description body. The text is kept exactly as the script has always
    # emitted it: each separator is a run of two or three newline characters.
    # NOTE(review): consumers that render the description as HTML will
    # presumably collapse these newlines; if visible line breaks are wanted,
    # "<br/>" markup is probably needed instead -- confirm with the target reader.
    _DESCRIPTION_TEMPLATE = (
        "Effective {}, {} has obligated {} for “{}” to be performed in {}.\n"
        "\n\n\n"
        "This is a {} ({}) for {} under the NAICS Code {}.\n"
        "\n\n\n"
        "The current and ultimate completion dates are {} and {} respectively.\n"
        "The total value of base and exercised options is {} out of {}.\n"
        "\n\n\n"
        "Inherently Governmental? {} {}\n"
        "\n\n"
        "Government-furnished? {}\n"
        "\n\n"
        "Multiyear? {}\n"
        "\n\n"
        "Service Contact Act? {}\n"
        "\n\n\n"
        "The vendor is {} in {}, {}.\n"
        "\n\n"
        "Revenue: {}\n"
        "\n\n"
        "Employees: {}\n"
        "\n\n"
        "Phone: {}\n"
        "\n\n\n"
        "For Profit? {}\n"
        "\n\n"
        "Status: {}\n"
    )

    def __init__(self, url, timeout=60):
        """Store the feed URL and the request headers.

        Args:
            url: Address that :meth:`make_requests` downloads.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block the script forever.
        """
        # The ``requests`` module itself is used as the "session" object.
        self.session = requests
        # NOTE: the Host header is fixed to www.fpds.gov regardless of ``url``.
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'en-US,en;q=0.9',
                        'Host': 'www.fpds.gov',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        self.url = url
        self.timeout = timeout

    def make_requests(self):
        """Download ``self.url`` and return the response body as text.

        A lower-cased copy with every ``ns1:`` prefix collapsed to ``ns1`` is
        also written to ``sample.html`` in the working directory.
        """
        response = self.session.get(self.url, headers=self.headers, timeout=self.timeout)
        with open('sample.html', 'w', newline='', encoding='utf-8') as export:
            export.write(response.text.replace('ns1:', 'ns1').lower())
        return response.text

    def amount_correction(self, source):
        """Format a numeric string as whole US dollars, e.g. ``'$1,234'``.

        The fractional part is truncated, not rounded.
        """
        return '$' + format_decimal(math.trunc(float(source)), locale='en_US')

    def open_existing(self):
        """Return the distinct, stripped lines of ``existing.csv`` as a list.

        A missing file is treated as "nothing seen yet" and yields ``[]``, so
        the very first run does not require a hand-made empty file.
        """
        try:
            with open('existing.csv') as existing:
                return list({line.strip() for line in existing})
        except FileNotFoundError:
            return []

    def parse_xml(self):
        """Run the whole pipeline: fetch, extract, de-duplicate, write.

        Nothing is written when the feed contains no unseen entries. An entry
        that lacks any field other than the multi-year flag raises
        ``IndexError`` and aborts the run.
        """
        seen = set(self.open_existing())
        source = self.make_requests()

        items = []
        new_keys = []
        for section in re.findall('<entry>(.+?)</entry>', source, re.DOTALL):
            key, item = self._build_item(section)
            if key in seen:
                continue
            # Remember the key immediately so an entry repeated within the
            # same response is emitted and recorded only once.
            seen.add(key)
            items.append(item)
            new_keys.append(key)

        if not items:
            return
        self._write_feed(items)
        self._record_keys(new_keys)

    def _build_item(self, section):
        """Parse one ``<entry>`` body and return ``(dedup_key, Item)``."""
        # Parse once per entry; every field lookup below reuses this tree.
        # The XPaths are all lower-case and use the prefix-less ``ns1...``
        # names -- presumably because the HTML parser lower-cases names and
        # does not understand the ``ns1:`` namespace prefix.
        tree = html.fromstring(section.replace('ns1:', 'ns1'))

        def first(path):
            """Return the first XPath match, stripped; IndexError if none."""
            return tree.xpath(path)[0].strip()

        office_name = first('//ns1fundingrequestingofficeid//@name')
        performance_city = first('//ns1placeofperformancezipcode//@city')
        vendor_name = first('//ns1vendorname//text()')
        obligated = self.amount_correction(first('//ns1obligatedamount//text()'))
        link = first('//link[@rel="alternate"]//@href') + self._LINK_SUFFIX
        # Date fields keep only the part before the first space.
        effective_date = first('//ns1effectivedate//text()').split(' ')[0]
        requirement = first('//ns1descriptionofcontractrequirement//text()')
        contracting_agency = first('//ns1contractingofficeagencyid//@name')
        extent_description = first('//ns1extentcompeted//@description')
        product_description = first('//ns1productorservicecode//@description')
        naics_description = first('//ns1principalnaicscode//@description')
        current_completion = first('//ns1currentcompletiondate//text()').split(' ')[0]
        ultimate_completion = first('//ns1ultimatecompletiondate//text()').split(' ')[0]
        exercised_value = self.amount_correction(first('//ns1totalbaseandexercisedoptionsvalue//text()'))
        all_options_value = self.amount_correction(first('//ns1totalbaseandalloptionsvalue//text()'))
        governmental = first('//ns1inherentlygovernmentalfunction//text()')
        governmental_description = first('//ns1inherentlygovernmentalfunction//@description')
        gfe_gfp = first('//ns1gfe-gfp//text()')
        try:
            multi_year = first('//ns1multiyearcontract//text()')
        except IndexError:
            # The multi-year flag is the one optional field; it is rendered
            # as "None" in the description when absent.
            multi_year = None
        service_act_description = first('//ns1servicecontractact//@description')
        vendor_city = first('//ns1vendorlocation//ns1city//text()')
        vendor_state = first('//ns1vendorlocation//ns1state//text()')
        annual_revenue = self.amount_correction(first('//ns1annualrevenue//text()'))
        employees = first('//ns1numberofemployees//text()')
        vendor_phone = first('//ns1vendorlocation//ns1phoneno//text()')
        for_profit = first('//ns1isforprofitorganization//text()')
        size_determination = first('//ns1contractingofficerbusinesssizedetermination//@description')

        title = self._TITLE_TEMPLATE.format(office_name, performance_city, obligated, vendor_name)
        description = self._DESCRIPTION_TEMPLATE.format(
            effective_date, office_name, obligated, requirement, performance_city,
            contracting_agency, extent_description, product_description, naics_description,
            current_completion, ultimate_completion, exercised_value, all_options_value,
            governmental, governmental_description, gfe_gfp, multi_year,
            service_act_description, vendor_name, vendor_city, vendor_state,
            annual_revenue, employees, vendor_phone, for_profit, size_determination,
        )

        # De-duplication key: PIID immediately followed by the signed date.
        key = first('//ns1piid//text()') + first('//ns1signeddate//text()').split(' ')[0].strip()
        return key, Item(title=title, link="{}".format(link), description=description)

    def _write_feed(self, items):
        """Serialize ``items`` as an RSS document into ``fpds_rss_feed.xml``."""
        feed = Feed(title="RSS FEEDS",
                    link="http://167.99.192.145/fpds_rss_feed.xml",
                    description="Customized RSS Feed",
                    generator="Shekhar Samanta",
                    items=items)
        with open("fpds_rss_feed.xml", "w", encoding="utf-8") as export:
            export.write(feed.rss())

    def _record_keys(self, keys):
        """Append each key as its own row to ``existing.csv``."""
        with open('existing.csv', 'a', newline='') as export:
            csv.writer(export).writerows([key] for key in keys)
# Search query whose feed is converted to RSS when this script runs.
FPDS_QUERY_URL = 'https://www.fpds.gov/ezsearch/fpdsportal?s=FPDSNG.COM&indexName=awardfull&templateName=1.4.4&q=OBLIGATED_AMOUNT%3A%5B50000%2C%29+AND+PRINCIPAL_NAICS_CODE%3A%28541618+OR+541690+OR+541820+OR+541910+OR+541990+OR+561110+OR+561499+OR+561611+OR+561990+OR+921190+OR+922190+OR+923110+OR+923130+OR+928110+OR+928120%29+AND+AGENCY_CODE%3A%280559+OR+1100+OR+1145+OR+1153+OR+1204+OR+1301+OR+1544+OR+1549+OR+1550+OR+1900+OR+3400+OR+7003+OR+7009+OR+7022+OR+7100+OR+7200+OR+7505+OR+7523+OR+8000+OR+8300+OR+8900+OR+9543+OR+9577%29+PRODUCT_OR_SERVICE_CODE%3A%28+AJ96+OR+B506+OR+B507+OR+B522+OR+B544+OR+B548+OR+B549+OR+B550+OR+B551+OR+R405+OR+R406+OR+R407+OR+R408+OR+R409+OR+R412+OR+R419+OR+R422+OR+R423+OR+R426+OR+R499+OR+R699+OR+R706+OR+R707+OR+R708+OR+R799%29+AND+SIGNED_DATE%3A%5B2018%2F03%2F01%2C%29+AND+POP_STATE_NAME%3A%28%22VIRGINIA%22+OR+%22MARYLAND%22+OR+%22DISTRICT+OF+COLUMBIA%22%29+-%22DOMESTIC+AWARDEES+%28UNDISCLOSED%29%22+-%22FOREIGN+AWARDEES%22&rss=1&feed=atom0.3'

fpds = Scraper(FPDS_QUERY_URL)
# parse_xml() has no return statement, so ``source`` ends up as None; the
# call is made for its file-writing side effects.
source = fpds.parse_xml()
我尝试使用不同的 HTML 标签让 Chrome 显示换行,但没有效果。Description 标记中含有换行符 '\n',但同样不起作用。
运行前需要在同一目录中放置一个空白的 CSV 文件 "existing.csv"。