我在用Python编写代码：使用Beautiful Soup获取一个URL页面中所有的'a'标签，取位置3处的链接并跟随该链接，然后重复这一过程大约18次。下面附上的代码只重复了两次——我不知道如何在循环中把同一过程重复18次。感谢任何帮助。
import re
import urllib
from BeautifulSoup import *
htm1= urllib.urlopen('https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html ').read()
soup =BeautifulSoup(htm1)
tags = soup('a')
list1=list()
for tag in tags:
x = tag.get('href', None)
list1.append(x)
M= list1[2]
htm2= urllib.urlopen(M).read()
soup =BeautifulSoup(htm2)
tags1 = soup('a')
list2=list()
for tag1 in tags1:
x2 = tag1.get('href', None)
list2.append(x2)
y= list2[2]
print y
好的，我刚刚编写了下面这段代码，它可以运行，但每次循环得到的都是相同的4个链接，看起来循环中有问题（注意：我尝试的是循环4次）。
import re
import urllib
from BeautifulSoup import *
list1=list()
url = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html'
for i in range (4): # repeat 4 times
htm2= urllib.urlopen(url).read()
soup1=BeautifulSoup(htm2)
tags1= soup1('a')
for tag1 in tags1:
x2 = tag1.get('href', None)
list1.append(x2)
y= list1[2]
if len(x2) < 3: # no 3rd link
break # exit the loop
else:
url=y
print y
答案 0（得分：8）
我无法在循环中重复相同的过程18次。
要在Python中重复18次，可以使用 for _ in range(18) 循环：
#!/usr/bin/env python2
from urllib2 import urlopen
from urlparse import urljoin
from bs4 import BeautifulSoup  # $ pip install beautifulsoup4

# Hop from page to page at most 18 times, each time moving to the
# page's 3rd link; stop early if a page has fewer than 3 links.
url = 'http://example.com'
hops_left = 18
while hops_left:
    hops_left -= 1
    links = BeautifulSoup(urlopen(url)).find_all('a', href=True)  # all <a href> links
    if len(links) < 3:  # no 3rd link
        break           # exit the loop
    # Resolve relative hrefs against the current page (note: ignores <base href>).
    url = urljoin(url, links[2]['href'])
答案 1（得分：1）
您应该使用递归，参见 http://www.python-course.eu/recursive_functions.php
// NOTE(review): this jQuery snippet looks misplaced in a BeautifulSoup answer.
// It marks as selected the <option> of #ddlCountry whose text equals `co`
// (case-insensitive comparison on both sides).
$('select#ddlCountry option').each(function () {
    if ($(this).text().toLowerCase() == co.toLowerCase()) {
        this.selected = true;
        // NOTE(review): a bare `return` only ends this iteration's callback;
        // `return false` is what stops a jQuery .each() loop -- confirm intent.
        return;
    } });
如果你想限制递归的深度，可以把当前所在的层级作为参数一并传递下去，例如：
def GetLinks(initialPage, max_depth=18):
    """Recursively collect hrefs starting from *initialPage*.

    Returns a list holding each href found on the page, each followed by
    the (nested) list gathered by recursing into that link.

    Fixes vs. the original answer:
      * the recursion now terminates after *max_depth* levels -- the
        original had no base case and recursed without bound;
      * anchors without an href (tag.get returns None) are skipped
        instead of being passed to urlopen, which would crash.
    """
    if max_depth <= 0:
        return []  # depth budget exhausted: stop recursing
    htm1 = urllib.urlopen(initialPage).read()
    soup = BeautifulSoup(htm1)
    tags = soup('a')
    list1 = list()
    for tag in tags:
        x = tag.get('href', None)
        if x is None:
            continue  # anchor without an href -- nothing to follow
        list1.append(x)
        list1.append(GetLinks(x, max_depth - 1))
    return list1
答案 2（得分：1）
import urllib
from BeautifulSoup import *
URL = raw_input("Enter the URL:") #Put insurance
link_line = int(raw_input("Enter the line of the desired link:")) - 1 #Put insurance
count = int(raw_input("Enter the loop repeat times:")) #Put insurance
while count >= 0:
html = urllib.urlopen(URL).read()
soup = BeautifulSoup(html)
tags = soup('a')
print URL
URL = tags[link_line].get("href", None)
count = count - 1
答案 3（得分：1）
import urllib
from BeautifulSoup import *
url = raw_input('http://example')
for i in range(18):
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
tags = soup('a')
s = []
for tag in tags:
x = tag.get('href', None)
s.append(x)
print s[3]
url = s[3]
print "ansver:", s[3]
答案 4（得分：1）
import urllib
from BeautifulSoup import *
url='http://python-data.dr-chuck.net/known_by_Eesa.html'
counts=raw_input('Enter number of pages to jump: ')
counts=int(counts)
pos=raw_input('Enter position: ')
pos=int(pos)
y1= list()
y2=list()
count=0
while True:
data=urllib.urlopen(url).read()
soup= BeautifulSoup(data)
tags=soup('a')
for tag in tags:
value=tag.get('href',None)
value=str(value)
y1.append(value)
t=y1[pos-1]
y2.append(t)
y1=[]
count=count+1
if count==counts:break
else:
url=t
continue
print y2
答案 5（得分：1）
import ssl
import urllib.request
from bs4 import BeautifulSoup

# Fix vs. the original answer: `ctx`, `urllib.request` and `BeautifulSoup`
# were used without ever being imported/defined, so the snippet could not run.
# Ignore SSL certificate errors, matching the other answers in this thread.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def functiontofollowlink(url, position=18):
    """Return the href of the anchor at 1-based `position` on `url`'s page.

    `position` defaults to 18 to keep the original hard-coded behaviour.
    If the page has fewer anchors than `position`, the input `url` is
    returned unchanged (same as the original fall-through).
    """
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    count = 0
    # Retrieve all of the anchor tags.
    for tag in soup('a'):
        count = count + 1
        if count == position:
            return tag.get('href', None)
    return url

url = input('Enter - ')
numberoftimes = int(input('Enter number of times to repeat:'))
# For the first hop you give the start link; it then re-follows
# `numberoftimes` more, printing each URL visited.
while numberoftimes >= 0:
    numberoftimes = numberoftimes - 1
    url = functiontofollowlink(url)
    print(url)
这样就能得到你想要的确切查询输出。
答案 6（得分：1）
我会使用嵌套循环来重复这一过程，并用计数器配合 break，使其停在你想要的那个链接上。
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import ssl
import re

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = "your first link here"
# Repeat 18 hops; on each page move to the anchor in position 3.
for _ in range(18):
    page = urllib.request.urlopen(url, context=ctx).read()
    anchors = BeautifulSoup(page, 'html.parser')('a')
    # The last of the first three anchors wins -- identical to counting
    # the tags and breaking once the counter passes 3.
    for tag in anchors[:3]:
        url = tag.get('href', None)
    print(url)
答案 7（得分：0）
// NOTE(review): this C# (Spire.XLS / Spire.Doc) snippet appears misplaced in a
// Python/BeautifulSoup thread -- it copies Excel worksheets into Word tables.
class ToWord
{
    /// <summary>
    /// Copies worksheets 1..qnumber of the workbook at <paramref name="fpath"/>
    /// into tables of a new Word document, saves it as "result.doc" and opens it.
    /// </summary>
    /// <param name="fpath">Path of the Excel workbook to load.</param>
    /// <param name="qnumber">Number of worksheets to copy (1-based sheet index).</param>
    public void CopyToWord(string fpath, int qnumber)
    {
        Workbook spirewb = new Workbook();
        spirewb.LoadFromFile(fpath);
        Document doc = new Document();
        for (int i = 1; i <= qnumber; i++)
        {
            Spire.Xls.Worksheet sheet = spirewb.Worksheets[i];
            // One new section + table per worksheet.
            Table table = doc.AddSection().AddTable(true);
            table.ResetCells(sheet.LastRow, sheet.LastColumn);
            for (int r = 1; r <= sheet.LastRow; r++)
            {
                for (int c = 1; c <= sheet.LastColumn; c++)
                {
                    // Excel ranges are 1-based; Word table cells are 0-based.
                    CellRange xCell = sheet.Range[r, c];
                    TableCell wCell = table.Rows[r - 1].Cells[c - 1];
                    // Fill data into the Word table.
                    TextRange textRange = wCell.AddParagraph().AppendText(xCell.NumberText);
                    // Copy font and cell style from Excel to Word.
                    CopyStyle(textRange, xCell, wCell);
                }
            }
        }
        doc.SaveToFile("result.doc", Spire.Doc.FileFormat.Doc);
        System.Diagnostics.Process.Start("result.doc");
    }

    /// <summary>
    /// Copies font style, background colour and horizontal alignment from an
    /// Excel cell to the corresponding Word text range / table cell.
    /// </summary>
    private static void CopyStyle(TextRange wTextRange, CellRange xCell, TableCell wCell)
    {
        // Copy font style.
        wTextRange.CharacterFormat.TextColor = xCell.Style.Font.Color;
        wTextRange.CharacterFormat.FontSize = (float)xCell.Style.Font.Size;
        wTextRange.CharacterFormat.FontName = xCell.Style.Font.FontName;
        wTextRange.CharacterFormat.Bold = xCell.Style.Font.IsBold;
        wTextRange.CharacterFormat.Italic = xCell.Style.Font.IsItalic;
        // Copy background colour.
        wCell.CellFormat.BackColor = xCell.Style.Color;
        // Copy text alignment (only Left/Center/Right are mapped here).
        switch (xCell.HorizontalAlignment)
        {
            case HorizontalAlignType.Left:
                wTextRange.OwnerParagraph.Format.HorizontalAlignment = HorizontalAlignment.Left;
                break;
            case HorizontalAlignType.Center:
                wTextRange.OwnerParagraph.Format.HorizontalAlignment = HorizontalAlignment.Center;
                break;
            case HorizontalAlignType.Right:
                wTextRange.OwnerParagraph.Format.HorizontalAlignment = HorizontalAlignment.Right;
                break;
        }
    }
}
答案 8（得分：0）
我发现使用 while 循环能让代码更简洁，而且可以灵活地更改这段代码的输入。
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter url: ')
count = int(input('Enter count: '))
pos = int(input('Enter position:'))

# Hop `count` times; each hop re-opens the current page and moves to the
# href at index `pos`.  `for _ in range(count)` is equivalent to the
# original `while count > 0` countdown (including non-positive input).
for _ in range(count):
    document = BeautifulSoup(urlopen(url, context=ctx).read(), "html.parser")
    # Extract every 'href=' value; the one at `pos` becomes the next url.
    hrefs = [anchor.get("href", None) for anchor in document("a")]
    url = hrefs[pos]
    # Show the url being followed on this hop.
    print('Retrieving:', url)