关注网站链接的重复流程(BeautifulSoup)

时间:2015-11-26 04:39:46

标签: python loops beautifulsoup

我在用Python编写代码:先用 Beautiful Soup 获取某个URL页面中所有的 'a' 标签,然后取位置3处的链接并访问它,接着对新页面重复同样的操作,总共大约要重复18次。下面附上的代码只重复了两次,我不知道如何用循环把同样的过程重复18次。任何帮助都将不胜感激。

import re
import urllib

from BeautifulSoup import *
htm1= urllib.urlopen('https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html ').read()
soup =BeautifulSoup(htm1)
tags = soup('a')
list1=list()
for tag in tags:
    x = tag.get('href', None)
    list1.append(x)

M= list1[2]

htm2= urllib.urlopen(M).read()
soup =BeautifulSoup(htm2)
tags1 = soup('a')
list2=list()
for tag1 in tags1:
    x2 = tag1.get('href', None)
    list2.append(x2)

y= list2[2]
print y

好的,我刚刚编写了这段代码,它正在运行,但我在结果中获得了相同的4个链接。看起来循环中有问题(请注意:我尝试循环4次)

import re
import urllib
from BeautifulSoup import *
list1=list()
url = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html'

for i in range (4):  # repeat 4 times
    htm2= urllib.urlopen(url).read()
    soup1=BeautifulSoup(htm2)
    tags1= soup1('a')
    for tag1 in tags1:
        x2 = tag1.get('href', None)
        list1.append(x2)
    y= list1[2]
    if len(x2) < 3:  # no 3rd link
        break  # exit the loop
    else:
        url=y             
    print y

9 个答案:

答案 0 :(得分:8)

  

我无法在循环中重复相同的过程18次。

要在Python中重复18次,可以使用for _ in range(18)循环:

#!/usr/bin/env python2
from urllib2 import urlopen
from urlparse import urljoin
from bs4 import BeautifulSoup  # $ pip install beautifulsoup4

# Follow the 3rd <a href> link on each page, 18 hops in a row.
url = 'http://example.com'
for _ in range(18):
    page = BeautifulSoup(urlopen(url))
    anchors = page.find_all('a', href=True)  # only anchors that carry an href
    if len(anchors) < 3:
        break  # fewer than three links: nothing left to follow
    third = anchors[2]['href']
    url = urljoin(url, third)  # resolve relative links; note: ignores <base href>

答案 1 :(得分:1)

您应该使用递归http://www.python-course.eu/recursive_functions.php

// NOTE(review): this jQuery fragment is unrelated to the Python/BeautifulSoup
// question above — it looks like a paste error in the answer. It selects the
// <option> of #ddlCountry whose text equals `co` (case-insensitively) and
// marks it selected; `return` inside .each() only stops further iteration
// after the current element.
$('select#ddlCountry option').each(function () {
if ($(this).text().toLowerCase() == co.toLowerCase()) {
    this.selected = true;
    return;
} });

如果你想设置一个你想去的等级的限制,你可以将你所处的等级作为一个参数传递,例如:

def GetLinks(initialPage, depth=18):
    """Recursively collect href values starting at initialPage.

    depth bounds the recursion (default 18 levels, per the question).
    Fixes two crashes in the original: it recursed with no limit at all,
    and it passed href=None straight into urlopen when an <a> tag had
    no href attribute.
    Returns a nested list: each href, followed by the list gathered from
    recursing into it.
    """
    if depth <= 0:
        return []
    htm1 = urllib.urlopen(initialPage).read()
    soup = BeautifulSoup(htm1)
    tags = soup('a')
    list1 = list()
    for tag in tags:
        x = tag.get('href', None)
        list1.append(x)
        if x is not None:  # skip anchors without an href (was a crash)
            list1.append(GetLinks(x, depth - 1))
    return list1

答案 2 :(得分:1)

import urllib
from BeautifulSoup import *

URL = raw_input("Enter the URL:") #Put insurance
link_line = int(raw_input("Enter the line of the desired link:")) - 1 #Put insurance
count = int(raw_input("Enter the loop repeat times:")) #Put insurance

while count >= 0:
    html = urllib.urlopen(URL).read()
    soup = BeautifulSoup(html)
    tags = soup('a')
    print URL
    URL = tags[link_line].get("href", None)
    count = count - 1

答案 3 :(得分:1)

import urllib
from BeautifulSoup import *
url = raw_input('http://example')
for i in range(18):
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)

    tags = soup('a')
    s = []

    for tag in tags:
        x = tag.get('href', None)
        s.append(x)
    print s[3]
    url = s[3]
print "ansver:", s[3]

答案 4 :(得分:1)

import urllib
from BeautifulSoup import *

url='http://python-data.dr-chuck.net/known_by_Eesa.html'
counts=raw_input('Enter number of pages to jump: ')
counts=int(counts)
pos=raw_input('Enter position: ')
pos=int(pos)
y1= list()
y2=list()
count=0
while True:
   data=urllib.urlopen(url).read()
   soup= BeautifulSoup(data)
   tags=soup('a')
   for tag in tags:
       value=tag.get('href',None)
       value=str(value)
       y1.append(value)
   t=y1[pos-1]
   y2.append(t)
   y1=[]
   count=count+1 
   if count==counts:break
   else:
    url=t
    continue
print y2

答案 5 :(得分:1)

import ssl
import urllib.request

# Bug fixes vs. the posted answer: `ctx` was referenced but never defined
# (NameError), and `return url` was dedented out of the function entirely
# (SyntaxError). Both are repaired here; behavior otherwise unchanged.
# (BeautifulSoup is assumed imported from bs4 as elsewhere in this thread.)
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')

def functiontofollowlink(url):
    """Return the href of the 18th <a> tag on the page at `url`.

    Falls through and returns the incoming url unchanged when the page
    has fewer than 18 anchors.
    """
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    count = 0
    # Retrieve all of the anchor tags
    tags = soup('a')
    for tag in tags:
        count = count + 1
        if count == 18:
            url = tag.get('href', None)
    return url

numberoftimes = int(input('Enter number of times to repeat:'))
# NOTE(review): the position (18) is hard-coded inside the function; pass it
# as a parameter if a different position is needed.
# The >= comparison makes the loop run numberoftimes+1 times, as posted.

while numberoftimes >= 0:
    numberoftimes = numberoftimes - 1
    url = functiontofollowlink(url)

print(url)

这为您提供了您想要的确切查询输出

答案 6 :(得分:1)

我会使用嵌套循环来重复这一过程,并用 count 计数、在到达你想要的链接位置时用 break 中断内层循环。

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import ssl
import re

# Disable certificate verification so https pages load without a CA bundle.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = "your first link here"
# Hop 18 times; each hop lands on the href of the 3rd anchor tag.
for _ in range(18):
    markup = urllib.request.urlopen(url, context=ctx).read()
    anchors = BeautifulSoup(markup, 'html.parser')('a')
    for position, anchor in enumerate(anchors, start=1):
        if position > 3:
            break  # stop once the 3rd link has been taken
        url = anchor.get('href', None)
print(url)

答案 7 :(得分:0)

// NOTE(review): this C# snippet (Spire.XLS / Spire.Doc) is unrelated to the
// Python/BeautifulSoup question above — likely posted on the wrong thread.
// It copies worksheets of an Excel workbook into tables of a new Word
// document, preserving per-cell font, background color and alignment.
class ToWord
    {
        // Loads the workbook at fpath, copies sheets at indices 1..qnumber
        // into one Word table per sheet, saves "result.doc" and opens it.
        public void CopyToWord(string fpath, int qnumber)
        {
            Workbook spirewb = new Workbook();
            spirewb.LoadFromFile(fpath);

            Document doc = new Document();
            for (int i = 1; i <= qnumber; i++)
            {
                // NOTE(review): starting at i = 1 skips Worksheets[0] if the
                // collection is 0-based — confirm this is intentional.
                Spire.Xls.Worksheet sheet = spirewb.Worksheets[i];

                // One new section + table per worksheet, sized to the used range.
                Table table = doc.AddSection().AddTable(true);
                table.ResetCells(sheet.LastRow, sheet.LastColumn);

                for (int r = 1; r <= sheet.LastRow; r++)
                {
                    for (int c = 1; c <= sheet.LastColumn; c++)
                    {
                        CellRange xCell = sheet.Range[r, c];
                        TableCell wCell = table.Rows[r - 1].Cells[c - 1];
                        //fill data to word table
                        TextRange textRange = wCell.AddParagraph().AppendText(xCell.NumberText);
                        //copy font and cell style from excel to word
                        CopyStyle(textRange, xCell, wCell);
                    }
                }
            }
                
            doc.SaveToFile("result.doc", Spire.Doc.FileFormat.Doc);
            System.Diagnostics.Process.Start("result.doc");
        }

        // Transfers font attributes, background color and horizontal alignment
        // from the Excel cell (xCell) onto the Word text range / table cell.
        private static void CopyStyle(TextRange wTextRange, CellRange xCell, TableCell wCell)
        {
            //copy font style
            wTextRange.CharacterFormat.TextColor = xCell.Style.Font.Color;
            wTextRange.CharacterFormat.FontSize = (float)xCell.Style.Font.Size;
            wTextRange.CharacterFormat.FontName = xCell.Style.Font.FontName;
            wTextRange.CharacterFormat.Bold = xCell.Style.Font.IsBold;
            wTextRange.CharacterFormat.Italic = xCell.Style.Font.IsItalic;
            //copy backcolor
            wCell.CellFormat.BackColor = xCell.Style.Color;
            //copy text alignment
            switch (xCell.HorizontalAlignment)
            {
                case HorizontalAlignType.Left:
                    wTextRange.OwnerParagraph.Format.HorizontalAlignment = HorizontalAlignment.Left;
                    break;
                case HorizontalAlignType.Center:
                    wTextRange.OwnerParagraph.Format.HorizontalAlignment = HorizontalAlignment.Center;
                    break;
                case HorizontalAlignType.Right:
                    wTextRange.OwnerParagraph.Format.HorizontalAlignment = HorizontalAlignment.Right;
                    break;
            }
        }
    }

答案 8 :(得分:0)

我发现使用 while 循环使代码更干净,并且可以选择更改该代码的输入。

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import ssl
    
    # Ignore SSL certificate errors
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    url = input('Enter url: ')
    count = int(input('Enter count: '))
    pos = int(input('Enter position:'))

    # Follow the link at index `pos`, exactly `count` times.
    while count > 0:
        # Re-opens the link
        markup = urlopen(url, context=ctx).read()
        soup = BeautifulSoup(markup, "html.parser")

        # Extract 'href=' values from every anchor on the page
        hrefs = [a.get("href", None) for a in soup("a")]
        url = hrefs[pos]
        # prints out the url on that position
        print('Retrieving:', url)

        # makes sure the loop isn't infinite
        count -= 1