简单的 PHP 正则表达式(抓取网站内容)

时间:2015-10-28 16:49:40

标签: php regex preg-match-all

我正试图通过以下代码从网站上获取一些信息:

我是不是没有正确使用正则表达式?

# Fetch the checkout page and print one attribute plus the full markup of
# every matching element.
# The URL carries an explicit scheme: requests raises MissingSchema for a
# bare host name such as "www.example.com".
checkoutURL = "https://www.arbitrarycustomercheckout.com"

# requests applies no timeout by default, so set one to avoid hanging forever.
bagInfo = requests.get(checkoutURL, timeout=10).content
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = bs4.BeautifulSoup(bagInfo, "html.parser")

for link in soup.find_all('htmlele'):
    # Single pre-formatted argument: prints the same text under Python 2 and 3.
    print("%s %s" % (link.get("htmlAttr"), link))
print("")

1 个答案:

答案 0 :(得分:0)

如评论中所述,DOMDocument 和 DOMXPath 才是可行的方法。我不是专业人士,所以这对我来说是一个新领域。

我最终得到的代码按照我想要的方式运行:

include('../assets/db_conn.php');

// Scrape the listing page, follow every post link, and collect one entry per
// post whose article page embeds a YouTube player. Each entry in $results has
// the keys href, image_src, site_name, title and youtube_href.
$baseUrl = 'http://domain.topdomain';
$embedPrefix = '//www.youtube.com/embed/';
$results = array();

// Buffer libxml parse warnings instead of emitting them; the previous setting
// is restored once scraping has finished.
$previousLibxmlSetting = libxml_use_internal_errors(true);

$articles = array();
$xpath = null;
$content = file_get_contents($baseUrl);

// Only parse when the download succeeded: loadHTML() rejects an empty string.
if ($content !== false && $content !== '') {
    // NOTE(review): the 'HTML-ENTITIES' target is reported as deprecated from
    // PHP 8.2 on — confirm the runtime version before upgrading.
    $content = mb_convert_encoding($content, 'HTML-ENTITIES', "UTF-8");

    $doc = new DOMDocument();
    $doc->loadHTML($content);
    libxml_clear_errors(); // Drop the buffered warnings so they do not pile up.

    $xpath = new DOMXPath($doc);
    $articles = $xpath->query("//div[@class='posts']/div[@class='post']");
}

foreach ($articles as $article) {
    // Start from a clean entry so no value (in particular youtube_href) leaks
    // over from the previously processed post.
    $result = array();

    $hrefNode = $xpath->query("a/attribute::href", $article)->item(0);
    if ($hrefNode === null) {
        continue; // No link means there is no article page to follow.
    }
    $result['href'] = $hrefNode->value;

    // The remaining listing fields are optional; a missing node yields null
    // instead of a fatal "property of non-object" error.
    $imageNode = $xpath->query("a/div[@class='post-image']/img/attribute::src", $article)->item(0);
    $result['image_src'] = $imageNode !== null ? $imageNode->value : null;

    $siteNode = $xpath->query("a/div[@class='post-image']/span[@class='post-site-name']", $article)->item(0);
    $result['site_name'] = $siteNode !== null ? $siteNode->textContent : null;

    $titleNode = $xpath->query("a/div[@class='post-box']/h3[@class='post-title']", $article)->item(0);
    $result['title'] = $titleNode !== null ? $titleNode->textContent : null;

    // Fetch the article page itself; the href is appended to the base URL
    // as-is, exactly like the original script did.
    $articleHtml = file_get_contents($baseUrl . $result['href']);
    if ($articleHtml === false || $articleHtml === '') {
        continue; // Page could not be downloaded, nothing to extract.
    }

    $articleDoc = new DOMDocument();
    $articleDoc->loadHTML($articleHtml);
    libxml_clear_errors();
    $articleXpath = new DOMXPath($articleDoc);

    // When several video containers exist, the last usable one wins (same
    // precedence as the original loop).
    foreach ($articleXpath->query("//div[@class='video-container']") as $container) {
        $srcNode = $articleXpath->query("iframe[@class='youtube-player']/attribute::src", $container)->item(0);
        if ($srcNode === null) {
            continue;
        }

        // The video ID sits between the embed prefix and an optional query string.
        $afterPrefix = explode($embedPrefix, $srcNode->value);
        if (!isset($afterPrefix[1])) {
            continue; // Not a YouTube embed URL.
        }
        $idAndQuery = explode('?', $afterPrefix[1]);
        if ($idAndQuery[0] !== '') {
            $result['youtube_href'] = $idAndQuery[0];
        }
    }

    // Keep only posts for which a video ID was found; the 'random' exclusion
    // is carried over unchanged from the original script.
    if (isset($result['youtube_href']) && $result['youtube_href'] !== 'random') {
        $results[] = $result;
    }
}

libxml_use_internal_errors($previousLibxmlSetting);

// Insert every scraped entry whose YouTube ID is not yet stored in the
// article table.
// NOTE(review): $dbh is assumed to be the PDO connection set up by
// db_conn.php — confirm.
$query = "SELECT yt_url FROM article";
$stmt = $dbh->prepare($query);
$stmt->execute();

// Fetch the single selected column directly as a flat list of IDs.
$yt_urls = $stmt->fetchAll(PDO::FETCH_COLUMN, 0);
print_r($yt_urls); // Output retained from the original script.

$stmt = $dbh->prepare("INSERT INTO article (title, fb_title, slug, yt_url, img_url) VALUES (:title, :fb_title, :slug, :yt_url, :img_url)");

foreach ($results as $value) {
    // Entries without a video ID cannot be de-duplicated, so skip them.
    if (!isset($value['youtube_href'])) {
        continue;
    }

    $youtubeId = $value['youtube_href'];

    // Strict comparison avoids PHP's loose string/number matching.
    if (in_array($youtubeId, $yt_urls, true)) {
        continue;
    }

    // The ID comes from scraped markup, so escape it before echoing as HTML.
    echo htmlspecialchars($youtubeId, ENT_QUOTES, 'UTF-8') . '<br>';

    $stmt->execute(array(
        ':title'    => $value['title'],
        ':fb_title' => $value['title'],
        ':slug'     => ltrim($value['href'], '/'),
        ':yt_url'   => $youtubeId,
        ':img_url'  => $value['image_src'],
    ));

    // Remember the ID so a second post with the same video in this batch is
    // not inserted again.
    $yt_urls[] = $youtubeId;
}