webscraper抓取图像,但不将信息输入数据库

时间:2011-01-08 05:47:25

标签: php sql dom curl

一次。我的脚本在我的数据库中输入信息时遇到了更多问题。 下面的脚本抓取页面,删除必要的信息,然后下载相关的图像文件。之后,它应该将从URL收集的信息输入数据库。出于某种原因,脚本似乎遍历URL,因为我获得了每个URL的下载图像,但是每个URL的产品都没有输入到数据库中。该脚本将插入第一个产品的类别和产品信息,然后它就会停止,并继续下载图像。

有什么建议吗?

<?php

define('IN_PHPBB', true);
$phpbb_root_path = (defined('PHPBB_ROOT_PATH')) ? PHPBB_ROOT_PATH : './';
$phpEx = substr(strrchr(__FILE__, '.'), 1);
include($phpbb_root_path . 'common.' . $phpEx);
include($phpbb_root_path . 'includes/simple_html_dom.' . $phpEx);

// Start session management
$user->session_begin();
$auth->acl($user->data);
$user->setup();

set_time_limit(259200);

function save($in, $out)
{
    $ch = curl_init ($in);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_BINARYTRANSFER,1);
    $rawdata=curl_exec($ch);
    curl_close ($ch);
    if(file_exists($out))
    {
        unlink($out);
    }
    $fp = fopen($out,'x');
    fwrite($fp, $rawdata);
    fclose($fp);
}

function scrape($i)
{
    $url = 'http:/xxxxxxxx/index.php?main_page=product_info&products_id='.$i.'&zenid=e4b7dde8de02e1df005d4549e2e3e529';
    echo "$url -- ";
    $exists = file_get_contents($url);
    if ($exists != false)
    {
        $html = file_get_html($url); 

        foreach($html->find('body') as $html)
        {
            $test = $html->find('#productName', 0);
            if ($test)
            {
                $item['title'] = trim($html->find('#productName', 0)->plaintext);
                $item['price'] = trim($html->find('#productPrices', 0)->plaintext);
                $item['cat'] = $html->find('#navBreadCrumb', 0)->plaintext;
                list($home, $item['cat'], $item['subcat'], $title) = explode("::", $item['cat']);
                $item['cat'] = str_replace("&nbsp;", "", $item['cat']);
                $item['subcat'] = str_replace("\n", "", str_replace("&nbsp;", "", $item['subcat']));
                $item['desc'] = trim($html->find('#productDescription', 0)->plaintext);
                $item['model'] = $html->find('ul#productDetailsList', 0)->find('li', 0)->plaintext;
                $item['model'] = explode(":", $item['model']);
                $item['model'] = trim($item['model'][1]);
                $item['manufacturer'] = $html->find('ul#productDetailsList', 0)->find('li', 1)->plaintext;
                $item['manufacturer'] = explode(":", $item['manufacturer']);
                $item['manufacturer'] = trim($item['manufacturer'][1]);
                foreach($html->find('img') as $img)
                {
                    if($img->alt == $item['title'])
                    {
                        $item['img_sm'] = $img->src;
                    }
                }

                $ret[] = $item;

            }
        }
        $html->clear();
        unset($html);
        unset($item);
        return $ret;
    }
    else
    {
        echo "Could not find page<br />";
    }
    unset($exists);
}

$i = 1;
$end = 9999999;

while($i < $end)
{
    $ret = scrape($i);

    if(isset($ret))
    {
        foreach($ret as $v)
        {
            $item['title'] = $v['title'];
            $item['price'] = $v['price'];
            $item['desc'] = $v['desc'];
            $item['model'] = $v['model'];
            $item['manufacturer'] = $v['manufacturer'];
            $item['image'] = $v['image'];
            $item['cat'] = $v['cat'];
            $item['subcat'] = $v['subcat'];
            $item['img_sm'] = $v['img_sm'];
        }
        unset($ret);
        unset($v);

        $sm_img_src = "http://xxxxxx/".$item['img_sm'];
        $ext = strrchr($item['img_sm'], '.');

        $filename = $item['model'] . $ext;

        $lg_img_src = "http://xxxxx/images/STC/".$filename;
        $new_sm = "./rip_images/small/{$filename}";
        $new_lg = "./rip_images/large/{$filename}";

        $item['image'] = $filename;

        save($lg_img_src,$new_lg);
        save($sm_img_src,$new_sm);

        //see if parent cat exists
        $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['cat']).'"';
        $result = $db->sql_query($sql);
        $parent = $db->sql_fetchrow($result);
        $db->sql_freeresult($result);
        // if not exists
        if($parent['cat_id'] == '')
        {
            //add the parent cat to the db
            $sql_ary = array(
                'cat_name' => $item['cat'],
                'cat_parent' => 0
            );
            $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
            $db->sql_query($sql);
            $cat_id = $db->sql_nextid();

            //see if subcat exists
            $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
            $result = $db->sql_query($sql);
            $row = $db->sql_fetchrow($result);
            $db->sql_freeresult($result);
            // if not exists
            if($row['cat_id'] == '')
            {
                //add subcat to db
                $sql_ary = array(
                    'cat_name' => $db->sql_escape($item['subcat']),
                    'cat_parent' => $cat_id
                );
                $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
                $db->sql_query($sql);
                $item_cat = $db->sql_nextid();
            }
            else //if exists
            {
                $item_cat = $row['cat_id'];
            }
        }
        else //if parent cat exists
        {
            //see if subcat exists
            $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
            $result = $db->sql_query($sql);
            $row = $db->sql_fetchrow($result);
            $db->sql_freeresult($result);
            // if not exists
            if($row['cat_id'] == '')
            {
                //add the subcat to the db
                $sql_ary = array(
                    'cat_name' => $db->sql_escape($item['subcat']),
                    'cat_parent' => $parent['cat_id']
                );
                $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
                $db->sql_query($sql);
                $item_cat = $db->sql_nextid();
            }
            else //if exists
            {
                $item_cat = $row['cat_id'];
            }
        }

        $sql_ary = array(
            'item_title'      => $db->sql_escape($item['title']),
            'item_price'     => $db->sql_escape($item['price']),
            'item_desc'      => $db->sql_escape($item['desc']),
            'item_model'    => $db->sql_escape($item['model']),
            'item_manufacturer' => $db->sql_escape($item['manufacturer']),
            'item_image'    => $db->sql_escape($item['image']),
            'item_cat'      => $db->sql_escape($item_cat)
        );

        $sql = 'INSERT INTO ' . SHOP_ITEM_TABLE . ' ' . $db->sql_build_array('INSERT', $sql_ary);
        $db->sql_query($sql);
        garbage_collection();
        echo 'Done<br />';
    }
    $i++;
    unset($item);
}

?>

1 个答案:

答案 0 :(得分:0)

  1. 您是否查看了数据库库生成的实际查询字符串?
  2. 您是否添加了任何调试以查看查询是否成功?当查询调用失败时,大多数PHP db库都返回一个布尔值FALSE。假设查询成功,你就会致盲。