我想直接将网页抓取的数据导入 PostgreSQL，而不是先将其导出到 .csv 文件再手动导入。
以下是我目前使用的代码：它把数据导出到 .csv 文件，然后我再手动导入数据库。如有任何帮助，将不胜感激。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'http://tis.nhai.gov.in/TollInformation?TollPlazaID=236'

# Fetch the raw HTML for the toll-plaza page.
uClient = uReq(my_url)
page1_html = uClient.read()
uClient.close()

# html parsing
page1_soup = soup(page1_html, "html.parser")

# The plaza ID is the query-string part after '?'; it never changes
# inside the loop, so compute it once up front.
ID = my_url[my_url.find("?"):]
mystr = ID.strip("?")

filename = "TollDetail12.csv"
# 'with' guarantees the CSV file is closed even if scraping raises.
with open(filename, "w") as f:
    headers = "ID, tollname, location, highwayNumber\n"
    f.write(headers)

    # grabbing data: each plaza's details live in a <div class="PA15">.
    containers = page1_soup.findAll("div", {"class": "PA15"})
    for container in containers:
        toll_name = container.p.b.text
        search1 = container.findAll('b')
        highway_number = search1[1].text
        # NOTE(review): position-based extraction (11th descendant) —
        # fragile if the page markup changes; verify against the site.
        location = list(container.p.descendants)[10]
        print("ID: " + mystr)
        print("toll_name: " + toll_name)
        print("location: " + location)
        print("highway_number: " + highway_number)
        # Commas inside highway_number would break the CSV row,
        # so map them to '|' before writing.
        f.write(mystr + "," + toll_name + "," + location + ","
                + highway_number.replace(",", "|") + "\n")
答案 0（得分：0）
您需要安装psycopg2
pip包。除此之外,使用您的项目特定信息编辑文件,尚未测试但应该有效。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import psycopg2

my_url = 'http://tis.nhai.gov.in/TollInformation?TollPlazaID=236'

# Fetch and parse the toll-plaza page.
uClient = uReq(my_url)
page1_html = uClient.read()
uClient.close()

# html parsing
page1_soup = soup(page1_html, 'html.parser')

# grabbing data: each plaza's details live in a <div class="PA15">.
containers = page1_soup.findAll('div', {'class': 'PA15'})

# The plaza ID is the query-string part after '?'; it is loop-invariant,
# so compute it once instead of once per container.
ID = my_url[my_url.find('?'):]
mystr = ID.strip('?')

# Make the connection to PostgreSQL — edit database/user/password
# (and host, if not localhost) for your project.
conn = psycopg2.connect(database='database_name',
                        user='user_name', password='user_password', port=5432)
try:
    cursor = conn.cursor()
    # Parameterized INSERT (%s placeholders) — psycopg2 escapes the
    # values, so scraped text cannot inject SQL.
    query = ("INSERT INTO table_name "
             "(ID, toll_name, location, highway_number) "
             "VALUES (%s, %s, %s, %s);")
    for container in containers:
        toll_name = container.p.b.text
        search1 = container.findAll('b')
        highway_number = search1[1].text
        # NOTE(review): position-based extraction (11th descendant) —
        # fragile if the page markup changes; verify against the site.
        location = list(container.p.descendants)[10]
        # BUG FIX: insert the stripped ID (mystr), not the raw
        # '?TollPlazaID=...' slice — this matches what the CSV version
        # of the script writes.
        data = (mystr, toll_name, location, highway_number)
        cursor.execute(query, data)
    # Commit the transaction once, after all rows are inserted.
    conn.commit()
finally:
    # Always release the connection, even if an insert fails.
    conn.close()