在python 3.4中使用bs4查找页面中的所有文本

时间:2015-03-03 08:38:27

标签: python-3.x beautifulsoup

我正在尝试用bs4抓取一个网址,但我无法获得网页上所有可见的字词。我也试过了find_all(text=True),但没有运气。

网址:http://www.ccs.neu.edu/home/mates/blog.html

import argparse
import os
import re
import requests
from collections import OrderedDict
from bs4 import BeautifulSoup

# CLI definition.  Note: parsing happens at import time, and the
# mutually exclusive group makes -l effectively required.
title = "Written by Sagar"
print(title)

parser = argparse.ArgumentParser(
    description=title,
    formatter_class=argparse.RawTextHelpFormatter,
)

operationGroup = parser.add_mutually_exclusive_group(required=True)
operationGroup.add_argument(
    '-l', dest="webList", action="store",
    help="Specify a text file with a list of URLs to  scrape (separated by newline).")

optionGroup = parser.add_argument_group('paramters and options')
optionGroup.add_argument(
    '-o', dest="outputFile", action="store",
    help="Output filename. (Default: wordlist.txt)")
optionGroup.add_argument(
    '-min', dest="minLength", action="store", type=int,
    help="Set the minimum number of characters for each word (Default: 3).")
optionGroup.add_argument(
    '-max', dest="maxLength", action="store", type=int,
    help="Set the maximum number of characters for each word (Default: 30).")

args = parser.parse_args()


def visible(element):
    """Return True if *element* is a text node a browser would render.

    Rejects text whose parent is a non-rendered container (style, script,
    head, title, or the document object itself) and HTML comment nodes.
    """
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # re.DOTALL: HTML comments often span multiple lines; without it '.'
    # never matches a newline, so multi-line comments slipped through.
    elif re.match(r'<!--.*-->', str(element), re.DOTALL):
        return False
    return True


def webUrl(fullUrl):
    """Scrape all visible words from *fullUrl* into the global ``wordList``.

    Applies the global ``minl``/``maxl`` length bounds when -min/-max were
    given; characters in ``charBlacklist`` are stripped before a word's
    length is measured.  Prints (rather than raises) on any failure.
    """
    # URL validation: scheme, then domain / localhost / IPv4, optional port
    # and path.  (The old pattern had a stray '^(?:http)s?://|' alternative
    # that accepted ANY string starting with http://.)
    validUrl = re.compile(
        r'^https?://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    if not validUrl.match(fullUrl):
        print('INVALID URL - {0}. Format must be http(s)://www.smeegesec.com.'.format(fullUrl))
        return

    try:
        u = requests.get(fullUrl)
        # u.text lets requests decode with the page's declared/detected
        # charset instead of assuming UTF-8 (which raised on other encodings).
        soup = BeautifulSoup(u.text, 'html.parser')

        # find_all(string=True) yields each text NODE.  get_text() (the old
        # code) returns one big string, so 'for token in texts' iterated
        # single characters — and the visible() filter was never applied.
        texts = soup.find_all(string=True)
        visible_texts = filter(visible, texts)

        # str.translate(None, chars) was Python 2 only; in Python 3 a
        # deletion table is built with str.maketrans.
        deleteTable = str.maketrans('', '', charBlacklist)
        for text in visible_texts:
            for token in text.split():
                if args.minLength or args.maxLength:
                    if not (len(token.translate(deleteTable)) < minl or len(token) > maxl):
                        wordList.append(token)
                else:
                    wordList.append(token)

        print("Scraping URL - {0}".format(fullUrl))
    except Exception as e:
        print('There was an error connecting to or parsing {0}'.format(fullUrl))
        print('Error: %s' % e)


def webList(webListFile):
    """Read URLs (one per line) from *webListFile* and scrape each via webUrl().

    Prints an error instead of raising when the path is not a regular file.
    """
    if not os.path.isfile(webListFile):
        print('Error opening file')
        return

    # Iterate the file lazily; the 'with' block closes it, so the old
    # trailing f.close() (called AFTER the with had already closed the
    # handle) is gone.  Also avoids shadowing this function's own name
    # with a local 'webList' list.
    with open(webListFile) as f:
        for url in f:
            webUrl(url.rstrip('\n'))


def output():
    """Write the de-duplicated global wordList to args.outputFile.

    Defaults the filename to 'wordlist.txt'; echoes every word to stdout
    and reports the unique-word count.  Prints (rather than raises) on
    any file error.
    """
    if not args.outputFile:
        args.outputFile = 'wordlist.txt'
    try:
        # OrderedDict.fromkeys de-duplicates while preserving first-seen order.
        wordListFinal = list(OrderedDict.fromkeys(wordList))

        with open(args.outputFile, 'w') as outputFile:
            for word in wordListFinal:
                print(word)
                outputFile.write(word)
                outputFile.write('\n')

        # Bug fix: report the number of UNIQUE words; the old code printed
        # len(wordList), which still contained duplicates.
        print('\n{0} unique words have been scraped.'.format(len(wordListFinal)))
        print('Output file successfully written: {0}'.format(args.outputFile))
    except Exception as e:
        # Bug fix: the old except referenced outputFile.name, which was
        # unbound (NameError) whenever open() itself failed.
        print('Error creating output file: {0}'.format(args.outputFile))
        print(e)

if __name__ == "__main__":

    # Shared mutable state read/written by webUrl() and output().
    wordList = list()
    # Characters stripped from a word before its length is checked.
    # (The old code assigned this twice; the duplicate is removed.)
    charBlacklist = ""

    # Resolve the length bounds only when the user asked for filtering;
    # webUrl() guards on the same condition before reading minl/maxl.
    if args.minLength or args.maxLength:
        minl = args.minLength if args.minLength else 3
        maxl = args.maxLength if args.maxLength else 30
        if minl > maxl:
            print('Argument minLength cannot be greater than maxLength. Setting defaults to min=3 max=30.')
            minl = 3
            maxl = 30

    if args.webList:
        webList(args.webList)

    output()

0 个答案:

没有答案