My aim is to scrape all the profile names, review titles, usernames, user locations and posting times of the reviews from the Reliance Jio reviews pages, and store them in a CSV file.
The site I want to scrape is http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061
When I tried to store the scraped data from the first two pages in a CSV file, the output was malformed: each row had more columns than expected, and a single sentence was parsed into many cells.
My code:
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
driver.get(url)
wait = WebDriverWait(driver, 10)
soup=BeautifulSoup(driver.page_source,"lxml")
for items1 in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link1 = items1.find_element_by_css_selector(".reviewdata a")
    link1.click()
    time.sleep(2)
csv = open('index.csv','w')
column = "Name,Location,Review_data,Review_title,Review_data\n"
csv.write(column)
soup1 = BeautifulSoup(driver.page_source,"lxml")
for item1 in soup1.select(".review-article"):
    name1 = item1.select("p a")[0].text
    location1 = item1.select("p")[1].text
    review_date1 = item1.select("small")[0].text
    review_title1 = item1.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data1 = ' '.join([' '.join(items1.text.split()) for items1 in item1.select(".reviewdata")])
    print("Name: {}\nLocation : {}\nReview_date: {}\nReview_Title: {}\nReview_Data: {}\n".format(name1, location1, review_date1, review_title1, review_data1))
    csv1 = open('index.csv','a')
    page1_data = name1 + "," + location1 + "," + review_date1 + "," + review_title1 + "," + review_data1 + "\n"
    csv1.write(page1_data)
uclient=uReq(url)
page_html=uclient.read()
uclient.close()
page_soup = soup(page_html,"html.parser")
container = soup.find("ul",{"class":"pages table"})
all_li = container.findAll("li")
last_div = None
for last_div in all_li:pass
if last_div:
    content = last_div.getText()
    content1 = int(content)
container1 = soup.findAll("li",{"class":"next"})
li=container1[0].find("a",{"class":"btn btn-link"}).attrs['href']
driver.get(li)
wait = WebDriverWait(driver, 10)
soup=BeautifulSoup(driver.page_source,"lxml")
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)
soup = BeautifulSoup(driver.page_source,"lxml")
for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    location = item.select("p")[1].text
    review_date = item.select("small")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
    print("Name: {}\nLocation : {}\nReview_date: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, location, review_date, review_title, review_data))
    csv2 = open("index.csv",'a')
    page2_data = name +","+ location+"," + review_date +","+ review_title +","+ review_data + "\n"
    csv2.write(page2_data)
driver.quit()
I need help finding the mistakes in my code so that the scraped data gets stored in the CSV file in a structured way.
Answer 0 (score: 2)
Look at your csv file in a text editor. The problem is that your spreadsheet program is splitting on the commas (and spaces).
The other problem is that you are not accounting for commas inside your scraped data. That is why you have the city and the country in different cells. You need to put quotes around values that contain commas.
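For example (the values here are made up for illustration), an unquoted row splits the location into two cells, while a quoted row keeps it in one:

Ravi,Delhi, India,20 Jun 2017,Good network        -> 5 cells
Ravi,"Delhi, India",20 Jun 2017,Good network      -> 4 cells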
Answer 1 (score: 1)
See
page1_data = name1 + "," + location1 + "," + review_date1 + "," + review_title1 + "," + review_data1 + "\n"
csv1.write(page1_data)
A location such as Delhi, India already contains a comma. If you keep joining fields with bare commas like this, the csv file cannot be parsed correctly.
One workaround is to add double quotes around text that contains a comma, so Delhi, India becomes "Delhi, India" after this step.
def preprocess(text):
    if "," in text:
        return '"' + text + '"'
    return text
Wrap every text value with that function:
page1_data = preprocess(name1) + "," + preprocess(location1) + "," + preprocess(review_date1) + "," + preprocess(review_title1) + "," + preprocess(review_data1) + "\n"
This should work.
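Note that this simple version breaks if a value itself contains a double quote or a newline. A slightly more robust sketch (my own addition, following the usual CSV rule of doubling embedded quotes) would be:

def preprocess(text):
    # Quote the field if it contains a comma, a double quote or a newline,
    # doubling any embedded quotes (the standard CSV escaping rule).
    if any(ch in text for ch in ',"\n'):
        return '"' + text.replace('"', '""') + '"'
    return text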
Another approach would be to change the delimiter to a different character.
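For example, a minimal sketch using tab-separated values instead of commas (assuming the review text contains no tabs):

# join the fields with tabs instead of commas
page1_data = "\t".join([name1, location1, review_date1, review_title1, review_data1]) + "\n"
csv1.write(page1_data)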
Answer 2 (score: 1)
You should use the csv module, because it automatically takes care of commas and newline/enter characters inside the text.
Create the csv writer
f = open('index.csv','w')
csv_writer = csv.writer(f)
and write the header using a list, not a single string
column = ["Name", "Location", "Review_date", "Review_title", "Review_data"]
csv_writer.writerow(column)
Write rows of data the same way, as lists
row = [name, location, review_date, review_title, review_data]
csv_writer.writerow(row)
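One caveat worth adding here: in Python 3 the file should be opened with newline='' so the csv module controls the line endings itself, otherwise blank rows can appear on Windows:

# let the csv module manage line endings; encoding is an assumption
f = open('index.csv', 'w', newline='', encoding='utf-8')
csv_writer = csv.writer(f)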
Full code:
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv
# --- init ---
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
# --- open file ---
f = open("index.csv", "w")
csv_writer = csv.writer(f)
columns = ["Name", "Location", "Review_data", "Review_title", "Review_data"]
csv_writer.writerow(columns)
# ---- get data ---
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, "lxml")
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)
soup = BeautifulSoup(driver.page_source, "lxml")
for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    location = item.select("p")[1].text
    review_date = item.select("small")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
    print("Name:", name)
    print("Location:", location)
    print("Review_date:", review_date)
    print("Review_Title:", review_title)
    print("Review_Data:", review_data)
    row = [name, location, review_date, review_title, review_data]
    csv_writer.writerow(row)
# --- get next url ---
uclient = uReq(url)
page_html = uclient.read()
uclient.close()
soup = BeautifulSoup(page_html, "html.parser")
container = soup.find("ul", {"class": "pages table"})
all_li = container.findAll("li")
if all_li:
    last_div = all_li[-1]
    content = last_div.getText()
    content = int(content)
container = soup.findAll("li", {"class": "next"})
li = container[0].find("a", {"class": "btn btn-link"}).attrs['href']
# ---- get data ---
driver.get(li)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, "lxml")
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)
soup = BeautifulSoup(driver.page_source, "lxml")
for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    location = item.select("p")[1].text
    review_date = item.select("small")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
    print("Name:", name)
    print("Location:", location)
    print("Review_date:", review_date)
    print("Review_Title:", review_title)
    print("Review_Data:", review_data)
    row = [name, location, review_date, review_title, review_data]
    csv_writer.writerow(row)
# --- end ---
driver.quit()
f.close()
EDIT: version without beautifulsoup and requests - only selenium
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv
def get_data(driver, csv_writer):
    for item in driver.find_elements_by_css_selector(".review-article"):
        name = item.find_elements_by_css_selector("p a")[0].text
        location = item.find_elements_by_css_selector("p")[1].text
        review_date = item.find_elements_by_css_selector("small")[0].text
        review_title = item.find_elements_by_css_selector("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
        review_data = item.find_elements_by_css_selector(".reviewdata")
        review_data = ' '.join([' '.join(items.text.split()) for items in review_data])
        print("Name:", name)
        print("Location:", location)
        print("Review_date:", review_date)
        print("Review_Title:", review_title)
        print("Review_Data:", review_data)
        row = [name, location, review_date, review_title, review_data]
        csv_writer.writerow(row)
# --- init ---
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
# --- open file ---
f = open("index.csv", "w")
csv_writer = csv.writer(f)
columns = ["Name", "Location", "Review_data", "Review_title", "Review_data"]
csv_writer.writerow(columns)
# ---- get data ---
print('url:', url)
driver.get(url)
wait = WebDriverWait(driver, 10)
get_data(driver, csv_writer)
# --- get next url ---
url = driver.find_element_by_xpath('//li[@class="next"]/a').get_attribute("href")
# ---- get data ---
print('url:', url)
driver.get(url)
wait = WebDriverWait(driver, 10)
get_data(driver, csv_writer)
# --- end ---
driver.quit()
f.close()
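The two get_data calls above only cover the first two pages. A sketch of how the same pattern could be extended to walk every page by following the "next" link until it disappears (the NoSuchElementException handling is an assumption about how the last page behaves):

from selenium.common.exceptions import NoSuchElementException

while True:
    get_data(driver, csv_writer)
    try:
        # follow the "next" pagination link; the last page has none
        url = driver.find_element_by_xpath('//li[@class="next"]/a').get_attribute("href")
    except NoSuchElementException:
        break
    print('url:', url)
    driver.get(url)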