Automatically scraping websites

Posted: 2019-02-26 05:19:20

Tags: selenium beautifulsoup web-crawler

I got help here earlier and am using the code below to crawl law.go.kr.
I am now trying to scrape other sites as well, such as http://lawbot.org, http://law.go.kr, and https://casenote.kr,
but the problem is that I don't really understand HTML...
I understand the code below and how it finds the HTML elements it needs, but the other sites are structured differently...
I would like to know how to adapt the code below to scrape those other pages.

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':

    # Request the case list. pg is the page number, outmax the number of items per page
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=79329&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")

    # Parse html using BeautifulSoup
    page = BeautifulSoup(response.text, "html.parser")

    # Collect the post numbers from the fetched page.
    # Only page 1 was requested above, so this loop runs once; to cover more
    # pages, request and re-parse each page inside the loop.
    items = []
    for i in range(1, 2):
        # Get all links
        links = page.select("#viewHeightDiv .s_tit a")
        # Loop all links and collect post numbers
        for link in links:
            # Parse post number from "onclick" attribute
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))

    # Open all posts and collect in posts dictionary with keys: number, url and text
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        parsed = BeautifulSoup(response.text, "html.parser")
        # Save the full decision text ('id': 'contentBody'); use 'class': 'pgroup' to save it without the title
        text = parsed.find('div', attrs={'id': 'contentBody'}).text
        title = parsed.select_one("h2").text
        posts.append({'number': item, 'url': url, 'text': text, 'title': title})

        with open("D://\LAWGO_DATA/" + item + '.txt', 'w', encoding='utf8') as f:
            f.write(text)

1 Answer:

Answer 0 (score: 1)

Here is another example, for lawbot.org. The approach is the same: read the last page number from the pagination, loop over the result pages, pull each case's title, document number, court and URL with CSS selectors, and then fetch and parse each case page:

import requests
from bs4 import BeautifulSoup

base_url = 'http://lawbot.org'
search_url = base_url + '/?q=유죄'  # search for "유죄" (guilty)

response = requests.get(search_url)

page = BeautifulSoup(response.text, "html.parser")
# The last pagination item before the "next" button holds the last page number
lastPageNumber = int(page.select_one("li.page-item:not(.next):nth-last-child(2)").text)

casesList = []

for i in range(1, lastPageNumber + 1):
    # Page 1 was already fetched above; request the remaining result pages
    if i > 1:
        response = requests.get(search_url + "&page=" + str(i))
        page = BeautifulSoup(response.text, "html.parser")

    # Each search result is a li.panre_lists entry inside the result list
    cases = page.select("div.panre_center > ul.media-list li.panre_lists")
    for case in cases:
        title = case.findChild("h6").text
        caseDocNumber = case.findChild(attrs={"class": "caseDocNumber"}).text
        caseCourt = case.findChild(attrs={"class": "caseCourt"}).text
        case_url = base_url + case.findChild("a")['href']

        casesList.append({"title": title, "caseDocNumber": caseDocNumber, "caseCourt": caseCourt, "case_url": case_url})
        # print("title:{}, caseDocNumber:{}, caseCourt:{}, caseUrl:{}".format(title, caseDocNumber, caseCourt, case_url))

# Fetch each collected case page and print the judgment text
for case in casesList:
    response = requests.get(case["case_url"])
    page = BeautifulSoup(response.text, "html.parser")
    body = page.find(attrs={"class": "panre_body"}).text
    print(body)
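
If you would rather write the results to files, as in your law.go.kr script, the last loop can be changed along these lines. This is only a minimal sketch that continues from the casesList built above: the directory D:/LAWBOT_DATA/ is just an example, and it assumes the case document number contains no characters that are invalid in a file name:

import os
import requests
from bs4 import BeautifulSoup

output_dir = "D:/LAWBOT_DATA/"          # example output directory, change as needed
os.makedirs(output_dir, exist_ok=True)  # create it if it does not exist yet

for case in casesList:
    response = requests.get(case["case_url"])
    page = BeautifulSoup(response.text, "html.parser")
    body = page.find(attrs={"class": "panre_body"}).text

    # Use the case document number as the file name
    file_name = case["caseDocNumber"].strip() + ".txt"
    with open(output_dir + file_name, "w", encoding="utf8") as f:
        f.write(body)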