# --- Excerpt: the database-insert section, duplicated here from the
# scraper function below for reference.
# NOTE(review): at module level this `global` statement is a no-op; `db`
# is assumed to be an open MySQLdb connection created elsewhere.
global db
cursor=db.cursor()
# Parameterized INSERT: 18 %s placeholders, one per column, filled by
# cursor.execute below (no string interpolation into the SQL itself).
query="""INSERT INTO Properties (Property_Num, Published_Date, Title,Price,Bedroom,Agency_Fee, Bathroom, Size,Prop_ref,Furnished_status,Rent_payment,Building_info,Amenities,Trade_name,Licence, RERA_ID,Phone_info,Short_link) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
cursor.execute(query,(count_curr, date_result, title_result, price_result, bedroom_result, agencyfee_result, bathroom_result, size_result, propertyref_result, furnished_result, rent_is_paid_result, building_result, Amenities_result, tradename_result, licencenum_result, reraid_result, phone_result, link_result))
# Commit the row, then tear down cursor and connection after every insert.
db.commit()
cursor.close()
db.close()
循环结构如下;数据库连接是在函数 every_property_in_a_page_data_extraction(a['href']) 内部建立的:
# Driver loop (simplified from getting_urls_of_all_pages below).
# Indentation restored: the `for` body and the `i` increment belong
# inside the `while`; as pasted, this snippet was an IndentationError.
while i <= 40:
    for a in link:  # has 25 loops which prints something
        every_property_in_a_page_data_extraction(a['href'])
    i += 1
问题在于:每次运行代码时,它都会先一次性打印出 25 个条目,而这期间其余代码似乎并没有真正执行(我通过插入一条额外的调试打印语句验证过——它在前 25 次循环中并没有被打印出来)。也就是说,程序先是连续打印 25 次,之后代码才开始执行;while 循环的第一轮里,格式化和数据库写入都没有发生,就好像它记住了数据库里已有的内容、跳过了 while 的第一轮一样。从第二轮循环开始,打印就正常了。
下面是我的完整程序(供参考):
import urllib
from bs4 import BeautifulSoup
import MySQLdb
import re
import pdb
def getting_urls_of_all_pages():
print "here is the fist loop"
i=1
while i<=40: #40 is the total number of main pages
url_rent_flat='http://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/?page='+str(i) #url of the main page (iterating to 40)
link=[]
htmlfile=urllib.urlopen(url_rent_flat).read()
soup=BeautifulSoup(htmlfile)
link=soup.find_all('a',xtclib=re.compile("listing_list_\d+_title_link"),href=True) #stores all the links (25) links of the page
"""
Part 2: passing each property url to process for data extraction
"""
for a in link:
every_property_in_a_page_data_extraction(a['href'])
i+=1
def every_property_in_a_page_data_extraction(url):
print "here is the second loop"
global count_prop
count_curr=""
date_result=""
title_result=""
price_result=""
bedroom_result=""
agencyfee_result=""
bathroom_result=""
size_result=""
propertyref_result=""
furnished_result=""
rent_is_paid_result=""
building_result=""
Amenities_result=""
tradename_result=""
licencenum_result=""
reraid_result=""
phone_result=""
link_result=""
"""
Part1: Extracting data
"""
try:
htmlfile=urllib.urlopen(url).read()
soup=BeautifulSoup(htmlfile)
print "\nProperty Number: ", count_prop,
count_curr=str(count_prop)
except:
count_curr="Error"
#date
try:
date=soup.find("h3","listing-details-header","span")
date_result= str(date.get_text().encode("utf-8").strip()[20:])
print "\nPublished date: ", date_result
except:
date_result="Error"
#title
try:
title= soup.find('span',{'id':'listing-title-wrap'})
title_result= str(title.get_text().strip().encode("utf-8"))
print "Title: ",title_result
except:
title_result="Error"
#price
try:
price = soup.find('span',{'id':'actualprice'})
price_result= str(price.get_text())
print "Price: ",price_result
except:
price_result="Error"
#Agency Fee, Bedroom, Bathroom, Size
spans_ABBS= []
for a in soup.select(".important-fields li span"):
spans_ABBS.append(a.text.strip())
strongs_ABBS=[]
for a in soup.select(".important-fields li strong"):
strongs_ABBS.append(a.text.strip())
for name, value in zip(spans_ABBS, strongs_ABBS):
if name=="Agency Fees:":
try:
agencyfee_result= str(value)
print "Agency Fees: ", agencyfee_result
except:
agencyfee_result="Error"
elif name=="Bedrooms:":
try:
bedroom_result= str(value)
print "Number of Bedrooms: ",bedroom_result
except:
bedroom_result="Error"
elif name=="Bathrooms:":
try:
bathroom_result= str(value)
print "Number of Bathrooms: ", bathroom_result
except:
bathroom_result="Error"
elif name=="Size:":
try:
size_result= str(value)
print "Size of the property: ",size_result
except:
size_result="Error"
#Property Reference, Furnished, Listed By, Rent Is Paid, Building, Amenities:
spans_others=[]
for a in soup.select("#listing-details-list li span"):
spans_others.append(a.text.strip())
strongs_others=[]
for a in soup.select("#listing-details-list li strong"):
strongs_others.append(a.text.strip())
for name, value in zip(spans_others, strongs_others):
if name=="Listed by:":
break
elif name=="Property Reference:":
try:
propertyref_result=str(value.strip())
print "Property reference in Dubizel: ",propertyref_result
except:
propertyref_result="Error"
elif name=="Furnished:":
try:
furnished_result=str(value.strip())
print "Furnished status: ",furnished_result
except:
furnished_result="Error"
elif name=="Rent Is Paid:":
try:
rent_is_paid_result=str(value.strip())
print "Rent payment: ",rent_is_paid_result
except:
rent_is_paid_result="Error"
elif name=="Building:":
try:
building_result=str(value.strip())
print "Building info: ",building_result
except:
building_result="Error"
elif name=="Amenities:":
try:
for a in value.split(","):
Amenities_result+=a.strip()+","
print Amenities_result
except:
Amenities_result="Error"
#Agents info --> TTrade Name, DED Licence Number, RERA Registration Number
spans_broker=[]
for a in soup.select("#broker-details li span"):
spans_broker.append(a.text.strip())
strongs_broker=[]
for a in soup.select("#broker-details li strong"):
strongs_broker.append(a.text.strip())
for name, value in zip(spans_broker, strongs_broker):
if name=="Trade Name:":
try:
tradename_result=str(value.strip())
print "Trade name: ",tradename_result
except:
tradename_result="Error"
elif name=="DED Licence Number:":
try:
licencenum_result=str(value.strip())
print "Licence #: ",licencenum_result
except:
licencenum_result="Error"
elif name=="RERA Registration Number:":
try:
reraid_result=str(value.strip())
print "RERA ID #: ",reraid_result
except:
reraid_result="Error"
# phone num
try:
phone=soup.find("div", "phone-content")
for a in phone:
phone_result= str(a).get_text().strip().encode("utf-8")
print "Phone information:", phone_result
except:
phone_result="Error"
#link
try:
link = soup.find('input',{'id':'short-link-input'})
link_result= str(link.get('value'))
print "Short Reference link: ", link_result
except:
link_result="Error"
# #double check of the types before conversion
# print map(type, (count_curr, date_result, title_result, price_result, bedroom_result, agencyfee_result, bathroom_result, size_result, propertyref_result, furnished_result, rent_is_paid_result, building_result, Amenities_result, tradename_result,licencenum_result,reraid_result,phone_result,link_result))
count_prop+=1
# """
# Connecting to Database and putting data into in
# """
db= MySQLdb.connect("localhost","root","ahmed","Properties")
cursor=db.cursor()
query="""INSERT INTO Properties (Property_Num, Published_Date, Title,Price,Bedroom,Agency_Fee, Bathroom, Size,Prop_ref,Furnished_status,Rent_payment,Building_info,Amenities,Trade_name,Licence, RERA_ID,Phone_info,Short_link) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
cursor.execute(query,(count_curr, date_result, title_result, price_result, bedroom_result, agencyfee_result, bathroom_result, size_result, propertyref_result, furnished_result, rent_is_paid_result, building_result, Amenities_result, tradename_result, licencenum_result, reraid_result, phone_result, link_result))
db.commit()
cursor.close()
db.close()
#-----------------------------------------------------------
# Script entry point: count_prop is the module-global running property
# counter read and incremented by every_property_in_a_page_data_extraction();
# start it at 1, then kick off the crawl over all 40 listing pages.
count_prop=1
getting_urls_of_all_pages()