因此,我正在尝试从此 url 中抓取数据。您可以查看页面上的很多细节,这些细节位于 class 为 main_container 的 div 之下。但是,每当我尝试抓取时,该标签都不会出现在解析后的 soup 对象中。
<div class="main_container o-hidden" id="tfullview">
因此,我进行了研究,发现可能有两种方法:
因此此代码打印出 None,表示未找到该标签。
此div也具有o-hidden属性,这会阻止加载吗? 这是div:
pyqt的代码:
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
import bs4 as bs
import requests
class Client(QWebPage):
    """Headless QtWebKit page: loads *url* and blocks until rendering completes.

    After construction, ``mainFrame().toHtml()`` returns the fully rendered
    (JavaScript-executed) DOM of the page.
    """

    def __init__(self, url):
        # A QApplication must exist before any Qt object is created.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # When the page (including JS-injected content) finishes loading,
        # on_page_load() stops the event loop so construction can return.
        self.loadFinished.connect(self.on_page_load)
        self.mainFrame().load(QUrl(url))
        # Blocks here until on_page_load() calls app.quit().
        self.app.exec_()

    def on_page_load(self):
        # Page is fully loaded -- release the blocked constructor.
        self.app.quit()
# Tender "full view" page whose content is injected by JavaScript at load time.
url = 'https://eprocure.gov.in/cppp/tendersfullview/MjMyODQwA13h1OGQ2NzAxYTMwZTJhNTIxMGNiNmEwM2EzNmNhYWZhODk=A13h1OGQ2NzAxYTMwZTJhNTIxMGNiNmEwM2EzNmNhYWZhODk=A13h1MTU1MzU4MDQwNQ==A13h1NzIxMTUvODUwOCA4NTA5LzE4L0NPVy9PV0M=A13h1MjAxOV9JSFFfNDU4NjEzXzE='

# Client blocks until the page (and its JavaScript) has finished rendering.
page = Client(url)
rendered_html = page.mainFrame().toHtml()

# Parse the *rendered* HTML, so the JS-injected div is present.
soup = bs.BeautifulSoup(rendered_html, 'lxml')
container = soup.find("div", class_="main_container")
print(container)
答案 0 :(得分:1)
因此,建议改用 requests 重写。这里需要 Session,以便之后复用会话去请求列表中的链接。您可以很容易地把它改成循环遍历 allLinks 中的所有 URL;我这里只演示第一个。
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
# Landing page listing the latest active tenders (page 1).
url = 'https://eprocure.gov.in/cppp/latestactivetendersnew/cpppdata?page=1'

with requests.Session() as sess:
    resp = sess.get(url)
    soup = bs(resp.content, 'lxml')

    # Gather (title, href) for every tender link in the 5th table column.
    pairs = [(item.text, item['href']) for item in soup.select('td:nth-of-type(5) a')]
    titles, allLinks = zip(*pairs)

    # Follow the first tender link from the table.
    resp = sess.get(allLinks[0])
    soup = bs(resp.content, 'lxml')
    # container = soup.select_one('#tender_full_view')

    # Let pandas pull every HTML table out of the tender page.
    tables = pd.read_html(resp.content)
    for tbl in tables:
        print(tbl.fillna(''))
如果可以选择 Selenium,则可以执行以下操作,从第 1 页的着陆页收集所有指向各个标书的链接。然后,您可以通过索引访问 URL 列表来打开任意一个标书。我也会一并收集链接标题,以便您需要按标题搜索时,可以据此找到对应的索引。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
driver = webdriver.Chrome()
url = 'https://eprocure.gov.in/cppp/latestactivetendersnew/cpppdata?page=1'
driver.get(url)

# Wait (up to 5s) for the tender links in the 5th table column, then
# collect their (title, href) pairs.
anchors = WebDriverWait(driver, 5).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'td:nth-of-type(5) a'))
)
titles, allLinks = zip(*[(a.text, a.get_attribute('href')) for a in anchors])

# Open the first tender from the table.
driver.get(allLinks[0])

# Wait for the full-view container, then hand its inner HTML to pandas.
container = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#tender_full_view'))
)
html = container.get_attribute('innerHTML')
tables = pd.read_html(html)
for tbl in tables:
    print(tbl.fillna(''))
答案 1 :(得分:0)
我已经使用 requests 和 lxml 为您编写了一个能快速运行的示例,不需要 selenium。
import requests
import lxml.html
# Pretend to be a real browser so the site serves the normal page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
}

# One session reuses the TCP connection and carries the headers/cookies
# across all tender-page requests.
_session = requests.Session()
_session.headers.update(headers)

latest_tender_url = "https://eprocure.gov.in/cppp/latestactivetendersnew/cpppdata?page=1"

resp = _session.get(latest_tender_url)
xml = lxml.html.fromstring(resp.content)

# Every anchor on the listing page that points at a tender "full view" page.
tender_urls = xml.xpath('//a[contains(@href, "tendersfullview")]//@href')

for url in tender_urls:
    t_resp = _session.get(url)
    t_xml = lxml.html.fromstring(t_resp.content)
    details = t_xml.xpath('//td[@id="tenderDetailDivTd"]')
    # Fix: the original used a list comprehension purely for its print
    # side effect, building and discarding a list of Nones. A plain loop
    # is the idiomatic form for side effects.
    for elm in details:
        print(elm.text_content())