在MongoDB文档中按列写入数据

时间:2018-07-03 07:51:19

标签: python excel mongodb selenium web-scraping

我写了一个爬虫,用于从网站上抓取数据。目前,我的代码把数据写入Excel文件,同时也会读取并更新该文件。代码首先读取Excel数据库,以确保Excel工作表中的现有信息得到更新;如果网站上出现了Excel数据库中没有的新信息,则会将其添加到工作表中。

以下是代码:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from selenium import webdriver
import time 
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from xlrd import open_workbook
from selenium.webdriver.chrome.options import Options
import logging


# Module-level column stores shared by the functions in this script.
# main() turns the event columns into one DataFrame, so entry i of each
# event column is expected to describe the same event.

# Columns filled from the conference listing page.
names, links, pics, descs = [], [], [], []
types, views, no_speakers = [], [], []
location, dates = [], []

# Columns filled from each event's own detail page.
organization, summ = [], []
twitter, facebook = [], []
contact, emails, website_link = [], [], []
venue, official_address = [], []
speakers, fees, at_tr = [], [], []

# NOTE(review): `people` is not referenced by the code visible here --
# confirm before removing.
people = []

# Links already present in the workbook when the run started.
prev_links = []
# Links of events whose stored data changed during this run.
update = []

# NOTE(review): module-level `index` looks like an unused placeholder --
# confirm before removing.
index = -1

def _read_listing_row(row):
    """Extract the listing-page fields of one event row.

    Everything is read before any module-level list is touched, so a row that
    fails half-way cannot leave the parallel lists with different lengths.

    Args:
        row: WebElement of one ``sec_conf_main`` container.

    Returns:
        dict with the keys ``name``, ``link``, ``pic``, ``desc``, ``types``,
        ``views``, ``no_speakers``, ``location`` and ``date``. The last four
        are ``None`` when the row does not show them.

    Raises:
        NoSuchElementException: if the summary, title link, logo link, logo
            image or description element is missing from the row.
    """
    conf = row.find_element_by_class_name('conf_summery')
    title_anchor = conf.find_element_by_class_name('c_name').find_element_by_tag_name('a')
    logo_anchor = row.find_element_by_class_name('conf_logo').find_element_by_tag_name('a')
    fields = {
        'name': title_anchor.get_attribute('title'),
        'link': logo_anchor.get_attribute('href'),
        'pic': logo_anchor.find_element_by_tag_name('img').get_attribute('src'),
        'desc': row.find_element_by_class_name('conf_desc').text,
        'types': ','.join(tag.get_attribute('title')
                          for tag in conf.find_elements_by_class_name('spel')),
        'views': None,
        'no_speakers': None,
        'location': None,
        'date': None,
    }

    # <strong> elements are read as (views, speakers) pairs. A trailing
    # unpaired element is ignored instead of raising IndexError.
    counters = conf.find_elements_by_tag_name('strong')
    for pos in range(0, len(counters) - 1, 2):
        fields['views'] = counters[pos].text
        fields['no_speakers'] = counters[pos + 1].text

    for line in conf.find_elements_by_class_name('c_summery'):
        # A summary line holding an <img> is treated as the location line.
        try:
            line.find_element_by_tag_name('img')
        except NoSuchElementException:
            pass
        else:
            fields['location'] = line.text
        # A summary line holding a <span> is treated as the date line; only
        # the part before '|' is kept (the whole text if there is no '|').
        try:
            line.find_element_by_tag_name('span')
        except NoSuchElementException:
            pass
        else:
            fields['date'] = line.text.split('|', 1)[0]
    return fields


def main_url(url):
    """Scrape the listing page at *url* into the module-level column lists.

    The page is opened in headless Chrome and the ``view_more`` button is
    clicked until clicking fails. Each ``sec_conf_main`` row is then either
    compared against the stored values (when its title is already in
    ``names``) or appended as a new event. Links of events whose stored
    values changed are recorded once in ``update``.

    Args:
        url: address of the listing page.

    Raises:
        NoSuchElementException: if a row lacks one of its mandatory elements.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(url)
        time.sleep(5)  # give the DOM time to load completely
        while True:
            try:
                driver.find_element_by_id('view_more').click()
                time.sleep(3)
            except Exception:
                # Deliberately broad: any failure to find or click the button
                # is what ends the "load more" loop.
                break

        for row in driver.find_elements_by_class_name('sec_conf_main'):
            fields = _read_listing_row(row)
            columns = (
                (links, fields['link']),
                (pics, fields['pic']),
                (descs, fields['desc']),
                (views, fields['views']),
                (no_speakers, fields['no_speakers']),
                (types, fields['types']),
                (location, fields['location']),
                (dates, fields['date']),
            )
            if fields['name'] in names:
                position = names.index(fields['name'])
                for store, value in columns:
                    # None means "not shown on the page": keep what is stored.
                    # The link is compared as an href string, not as a
                    # WebElement, so unchanged events are not flagged.
                    if value is not None and store[position] != value:
                        store[position] = value
                        if fields['link'] not in update:
                            update.append(fields['link'])
            else:
                names.append(fields['name'])
                for store, value in columns:
                    # Exactly one entry per column keeps the lists aligned.
                    store.append('' if value is None else value)
    finally:
        # quit() also closes the window, and now runs on failure too.
        driver.quit()

def _default_event_details():
    """Return the placeholder value of every detail-page column."""
    return {
        'organization': 'No Organization Given.',
        'summ': 'No Conference Summary Given.',
        'twitter': 'No Twitter Link',
        'facebook': 'No Facebook Link',
        'contact': 'No Contact Number Given.',
        'emails': 'No email.',
        'website_link': 'No Website Link',
        'venue': 'No Venue Given.',
        'official_address': 'No Official Address Given. ',
        'speakers': 'No Speakers',
        'fees': 'No Fees Given',
        'at_tr': 'No Attenders or Trackers Given',
    }


def _scrape_event_details(driver):
    """Read every detail-page column from the page loaded in *driver*.

    A field whose element is missing keeps its placeholder from
    ``_default_event_details``; that is why the ``NoSuchElementException``
    handlers below are intentionally empty.

    Args:
        driver: WebDriver that has already loaded the event page.

    Returns:
        dict keyed like ``_default_event_details()``.
    """
    details = _default_event_details()

    try:
        words = driver.find_element_by_class_name('speakers').text.split()
        details['organization'] = ' '.join(words[3:])  # drop the first three words
    except NoSuchElementException:
        pass

    try:
        summary = driver.find_element_by_class_name('conf_head_summary')
        details['summ'] = summary.find_element_by_tag_name('p').text
    except NoSuchElementException:
        pass

    try:
        details['twitter'] = driver.find_element_by_class_name('TW').get_attribute('title')
    except NoSuchElementException:
        pass

    try:
        details['facebook'] = driver.find_element_by_class_name('FB').get_attribute('title')
    except NoSuchElementException:
        pass

    try:
        # NOTE(review): an XPath starting with '//' searches the whole
        # document even when called on an element; left as-is because the
        # page layout cannot be confirmed from here.
        phone = driver.find_element_by_class_name('marB20').find_element_by_xpath(
            '//table/tbody/tr[1]/td[3]').text
        if phone:
            details['contact'] = phone
    except NoSuchElementException:
        pass

    # find_elements_* returns an empty list instead of raising, so a page
    # without e-mail addresses yields '' here. Both branches now use ','.
    details['emails'] = ','.join(
        address.text for address in driver.find_elements_by_class_name('emailFruser'))

    try:
        details['website_link'] = driver.find_element_by_id(
            'cRegistraionpopup5').get_attribute('href')
    except NoSuchElementException:
        pass

    try:
        details['venue'] = driver.find_element_by_class_name('conf_venue1').text
    except NoSuchElementException:
        pass

    try:
        details['official_address'] = driver.find_element_by_class_name('hotel-detail').text
    except NoSuchElementException:
        pass

    try:
        # Search relative to each speaker card. The document-absolute
        # '//div/h5/a' returned the first speaker's name for every card.
        # NOTE(review): assumes the name link sits in an <h5> inside the card.
        speaker_names = [
            card.find_element_by_xpath('.//h5/a').text
            for card in driver.find_elements_by_class_name('speaker_single_inn')
        ]
        details['speakers'] = ','.join(speaker_names) if speaker_names else 'No Speakers Given.'
    except NoSuchElementException:
        pass

    try:
        fee_table = driver.find_element_by_class_name('mobScroll')
        fee_rows = []
        # NOTE(review): document-absolute XPath, see the contact lookup above.
        for table_row in fee_table.find_elements_by_xpath('//table/tbody/tr'):
            try:
                table_row.find_element_by_class_name('ticketname_inn')
            except NoSuchElementException:
                continue  # not a ticket row
            fee_rows.append(table_row.text)
        details['fees'] = ';'.join(fee_rows)
    except NoSuchElementException:
        pass

    attendees = [entry.text for entry in driver.find_elements_by_class_name('r-speaker-info')]
    if attendees:
        details['at_tr'] = ','.join(attendees)

    return details


def each_event(item):
    """Scrape one event's detail page into the module-level detail columns.

    When *item* is in ``prev_links`` the stored row is compared column by
    column; changed values are overwritten and *item* is recorded once in
    ``update``. Otherwise exactly one entry is appended to every detail
    column -- placeholders if the page could not be scraped -- so later
    events do not shift into the wrong row.

    Scraping errors are logged, not raised. A known event whose page fails
    keeps its stored values.

    Args:
        item: URL of the event page.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=options)
    details = None
    try:
        driver.get(item)
        time.sleep(5)
        details = _scrape_event_details(driver)
    except Exception:
        # Best-effort per event, but no longer silent. A named logger is used
        # so the root logger is not configured implicitly before main() calls
        # logging.basicConfig().
        logging.getLogger(__name__).exception('Could not scrape event page %s', item)
    finally:
        driver.quit()  # also closes the window; now runs on failure too

    columns = (
        ('organization', organization),
        ('summ', summ),
        ('twitter', twitter),
        ('facebook', facebook),
        ('contact', contact),
        ('emails', emails),
        ('website_link', website_link),
        ('venue', venue),
        ('official_address', official_address),
        ('speakers', speakers),
        ('fees', fees),
        ('at_tr', at_tr),
    )

    if item in prev_links:
        if details is None or item not in links:
            return
        position = links.index(item)
        for key, store in columns:
            if store[position] != details[key]:
                store[position] = details[key]
                # `item` is still the URL string here: no inner loop reuses
                # the name, so WebElements no longer end up in `update`.
                if item not in update:
                    update.append(item)
    else:
        if details is None:
            details = _default_event_details()
        for key, store in columns:
            store.append(details[key])

def main():
    """Load the workbook, scrape the site, write the workbook and a run log.

    Steps:
        1. If ``EMedEvents.xlsx`` exists, load its data rows (row 0 is the
           header) into the module-level column lists.
        2. Remember the links loaded so far in ``prev_links``.
        3. Scrape the listing page, then every event's detail page.
        4. Write all columns back to the workbook.
        5. Write a timestamped log with the read/added counts and the links
           of the updated events.
    """
    import os  # local import: only this function needs it

    workbook_path = 'EMedEvents.xlsx'  # file to read from and write to

    # One ordered table drives both the positional read and the write, so the
    # column order in the sheet cannot drift between the two.
    columns = [
        ('Event Name', names),
        ('Event Dates', dates),
        ('Specialty', types),
        ('Event Location', location),
        ('Description', descs),
        ('Views', views),
        ('Speakers', no_speakers),
        ('Picture Source', pics),
        ('Event Link', links),
        ('Organized By', organization),
        ('Conference Summary', summ),
        ('Twitter Link', twitter),
        ('Facebook Link', facebook),
        ('Contact Number', contact),
        ('Email', emails),
        ('Website Link', website_link),
        ('Venue', venue),
        ('Official Address', official_address),
        ('Speaking', speakers),
        ('Fees', fees),
        ('Attenders and Trackers', at_tr),
    ]

    # A missing workbook is treated as an empty database (first run). The
    # loop is a no-op for a header-only sheet, so the workbook no longer
    # needs a second read through pandas just to test for emptiness.
    if os.path.exists(workbook_path):
        sheet = open_workbook(workbook_path).sheet_by_index(0)
        for row in range(1, sheet.nrows):
            for col, (_, store) in enumerate(columns):
                store.append(sheet.cell(row, col).value)

    prev_links.extend(links)

    main_url("https://www.emedevents.com/india-medical-conferences") #main url to use.
    for item in links:
        each_event(item) #get the detail-page information of each event.

    # Passing `columns=` fixes the written order explicitly instead of
    # relying on dict ordering.
    df = pd.DataFrame(dict(columns), columns=[header for header, _ in columns])
    df.to_excel(workbook_path, header=True, index=False)

    logging.basicConfig(filename = 'error_' + str(time.time()) + '.log', level = logging.INFO)
    logging.info('%d events were read from the excel sheet', len(prev_links))
    logging.info('%d events were added to the excel sheet', len(links) - len(prev_links))
    logging.info('Following are the links of the events that were updated:')
    for item in update:
        logging.info('%s', item)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

我需要在MongoDB中而不是在Excel中完成所有这些功能。我对MongoDB完全陌生,因此不知道该从哪些步骤入手。

任何帮助都会很棒。 提前致谢。

1 个答案:

答案 0 :(得分:0)

您可能应该为Python使用某种mongo API,例如: https://github.com/mongodb/mongo-python-driver/blob/master/README.rst

如果您完全不了解mongo,请从基础教程开始,然后使用API将值写入数据库: https://www.tutorialspoint.com/mongodb/mongodb_tutorial.pdf