我正试图通过以下代码从网站上获取一些信息:
我是不是没有正确使用正则表达式?
checkoutURL = "www.arbitrarycustomercheckout.com"
bagInfo = requests.get(checkoutURL).content
soup = bs4.BeautifulSoup(bagInfo)
for link in soup.findAll('htmlele'):
print link.get("htmlAttr"), link
print
答案 0 :(得分:0)
如评论中所述,DOMDocument 和 DOMXPath 是可行的方法。我不是专业人士,所以这对我来说是一个新领域。
我最终得到的代码按照我想要的方式运行:
include('../assets/db_conn.php');

/**
 * Parse an HTML string and return a DOMXPath bound to the resulting document.
 *
 * libxml parse errors are buffered instead of being emitted as PHP warnings,
 * then discarded, and the previous libxml error mode is restored afterwards.
 *
 * @param string|false $html Markup to parse; false or '' means nothing was fetched.
 * @return DOMXPath|null Null when there is no markup to parse.
 */
function scrape_load_xpath($html)
{
    if ($html === false || $html === '') {
        return null;
    }
    $doc = new DOMDocument();
    $previousMode = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    // Drop the buffered parse errors so they do not pile up across pages.
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);
    return new DOMXPath($doc);
}

/**
 * Return the text of the first node matched by an XPath expression.
 *
 * Works for attribute nodes as well as elements: for an attribute,
 * textContent is the attribute value.
 *
 * @param DOMXPath $xpath      XPath evaluator for the document.
 * @param string   $expression XPath expression, evaluated relative to $context.
 * @param DOMNode  $context    Node the expression is relative to.
 * @return string|null Null when nothing matches (or the query fails).
 */
function scrape_first_text($xpath, $expression, $context)
{
    $nodes = $xpath->query($expression, $context);
    if ($nodes === false || $nodes->length === 0) {
        return null;
    }
    return $nodes->item(0)->textContent;
}

$baseUrl = 'http://domain.topdomain';
$embedPrefix = '//www.youtube.com/embed/';

// Listing page: one div.post per article.
$content = file_get_contents($baseUrl);
if ($content !== false) {
    // Makes DOMDocument treat the markup as UTF-8.
    // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2.
    $content = mb_convert_encoding($content, 'HTML-ENTITIES', "UTF-8");
}
$xpath = scrape_load_xpath($content);
$articles = $xpath === null
    ? array()
    : $xpath->query("//div[@class='posts']/div[@class='post']");

$results = array();
foreach ($articles as $article) {
    // Build a fresh record per article so a value scraped for the previous
    // article (notably youtube_href) can never leak into this one.
    $result = array(
        'href' => scrape_first_text($xpath, "a/attribute::href", $article),
        'image_src' => scrape_first_text($xpath, "a/div[@class='post-image']/img/attribute::src", $article),
        'site_name' => scrape_first_text($xpath, "a/div[@class='post-image']/span[@class='post-site-name']", $article),
        'title' => scrape_first_text($xpath, "a/div[@class='post-box']/h3[@class='post-title']", $article),
        'youtube_href' => null,
    );

    // Detail page: pull the YouTube video id out of the embedded player URL.
    // Without a link there is no detail page to fetch.
    if ($result['href'] !== null) {
        $xpath1 = scrape_load_xpath(file_get_contents($baseUrl . $result['href']));
        $containers = $xpath1 === null
            ? array()
            : $xpath1->query("//div[@class='video-container']");
        foreach ($containers as $container) {
            $src = scrape_first_text($xpath1, "iframe[@class='youtube-player']/attribute::src", $container);
            if ($src === null) {
                continue;
            }
            // ".../embed/<id>?<params>" -> "<id>"
            $afterPrefix = explode($embedPrefix, $src);
            if (count($afterPrefix) < 2) {
                continue; // not a YouTube embed URL
            }
            $idAndQuery = explode('?', $afterPrefix[1]);
            $result['youtube_href'] = $idAndQuery[0];
        }
    }

    // Entries whose extracted id is the literal 'random' are dropped; articles
    // with no video are kept, with youtube_href left as null.
    if ($result['youtube_href'] !== 'random') {
        $results[] = $result;
    }
}
// Load every yt_url already stored in the article table into a flat list,
// then dump that list for inspection.
$query = "SELECT yt_url FROM article";
$stmt = $dbh->prepare($query);
$stmt->execute();
$urls = $stmt->fetchAll();
$yt_urls = array_map(
    function ($row) {
        return $row['yt_url'];
    },
    $urls
);
print_r($yt_urls);
// Insert every scraped article whose YouTube id is not stored yet.
// Reads $results (scraped records) and $yt_urls (ids already in the table).
$stmt = $dbh->prepare("INSERT INTO article (title, fb_title, slug, yt_url, img_url) VALUES (:title, :fb_title, :slug, :yt_url, :img_url)");
foreach ($results as $value) {
    $youtubeId = isset($value['youtube_href']) ? $value['youtube_href'] : null;

    // Strict comparison: with loose ==, numeric-looking ids such as
    // "1e3" and "1000" would be treated as the same video.
    if (in_array($youtubeId, $yt_urls, true)) {
        continue;
    }

    // The id comes from a remote page, so escape it before echoing into HTML.
    echo htmlspecialchars((string) $youtubeId, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '<br>';

    // The article path without its leading slash doubles as the slug.
    $slug = ltrim((string) $value['href'], '/');

    $inserted = $stmt->execute(array(
        ':title' => $value['title'],
        ':fb_title' => $value['title'],
        ':slug' => $slug,
        ':yt_url' => $youtubeId,
        ':img_url' => $value['image_src'],
    ));

    // Remember the id so a second scraped article with the same video is not
    // inserted again during this run.
    if ($inserted) {
        $yt_urls[] = $youtubeId;
    }
}