经过一个月的研究和尝试之后,我来这里就我的大数据量更新问题寻求帮助。
我正在尝试定期通过 YouTube API 更新数据库中的频道数据。我的“频道”(channel)表中有超过1500万条数据,每天都需要使用 Google YouTube API 进行更新。问题是更新过程非常慢。我尝试了几种不同的方法。
尝试#1:我使用 PHP 的 curl multi exec 在后台执行该过程。每次从数据库中取出500个频道并尝试定期更新这些数据,但仍然花费太多时间。
尝试#2:我认为速度慢是因为 MySQL 查询的事务时间较长,所以我决定使用 INSERT ... ON DUPLICATE KEY UPDATE 查询一次性更新500个频道的数据。文件本身执行得没有问题,但更新过程仍然花费了太多时间:它并不会一次更新500个,而是一次只更新2/3个。
尝试#3:我使用 multi curl exec 并行执行多个文件,每个文件各取500个频道并尝试并行更新,但仍然无法正常工作:仅更新500个频道就花费了超过10分钟。
我在这里添加了我的代码。
Cron curl 文件。我使用 cron 作业执行定期任务,并用一个 cron 尝试并行调用50个更新频道的脚本。这是我的 cron-curl.php:
<?php
// cron-curl.php — cron entry point that requests the cronN.php worker scripts
// through the cURL multi interface and prints whatever the workers returned.
$start = microtime(true);

$mh = curl_multi_init();
$handles = array();
$offset = 0;

// NOTE(review): with these bounds exactly one worker (cron1.php) is requested;
// raise the upper bound to actually run several workers in parallel.
for ($i = 1; $i < 2; $i++) {
    $url = "http://mydomain/api/cron" . $i . ".php?id=" . $i . "&offset=" . $offset;
    echo $url;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // The transfer is abandoned after 30 seconds even if the worker is still busy.
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_multi_add_handle($mh, $ch);
    $handles[] = $ch;

    // The offset handed to each successive worker advances by 25000.
    $offset = $offset + 25000;
}

// Drive all transfers to completion. curl_multi_select() blocks until there is
// socket activity, so the loop no longer spins at 100% CPU while waiting.
$isRunning = null;
do {
    $status = curl_multi_exec($mh, $isRunning);
    if ($status === CURLM_OK && $isRunning > 0) {
        if (curl_multi_select($mh, 1.0) === -1) {
            // select() can fail immediately on some platforms; back off briefly.
            usleep(20000);
        }
    }
} while ($isRunning > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

// Collect the output of each request and release its handle.
$outputs = array();
foreach ($handles as $index => $handle) {
    // A failed transfer has no content; cast so trim() never receives null.
    $outputs[$index] = trim((string) curl_multi_getcontent($handle));
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
}
curl_multi_close($mh);

$time = microtime(true) - $_SERVER["REQUEST_TIME_FLOAT"];
echo "Process Time: {$time}" . "<br>";
print_r($outputs);
printf("Elapsed time: %.2f seconds\n", microtime(true) - $start);
?>
下面是执行更新过程、并由 curl 在后台调用的代码:
cron1.php / cron2.php / cron3.php
<?php
// Worker bootstrap: read the request parameters, define the configuration
// constants and open the database connection used by the rest of the script.

// Both values end up in SQL statements further down, so force them to integers
// here instead of trusting the raw query string.
$offset = isset($_GET['offset']) ? (int) $_GET['offset'] : 0;
$monit_id = isset($_GET['id']) ? (int) $_GET['id'] : 0;
$less_id = $offset + 18000;

define('__ROOT__', dirname(dirname(__FILE__)));

$hostname = "localhost";
$host = "http://v3.domain.com/api/";
$db_name = "****";
$db_user = "****";
$db_password = "****";
$date = strtotime("now");
// REMOTE_ADDR is not set when the script is started from the command line.
$ip = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';

define('HOST_NAME', $hostname);
define('BASE_URL', $host);

try {
    $db = new PDO("mysql:host=" . HOST_NAME . ";dbname=$db_name", $db_user, $db_password);
} catch (PDOException $e) {
    echo $e->getMessage();
    // Without a connection $db is undefined and every later query would fail
    // with a fatal error, so stop here.
    exit(1);
}
/**
 * Download $url through the proxy stored in proxy_settings and extract the
 * value that follows the `"ptk":"` marker in the response body.
 *
 * @param string $url Address of the page to download.
 *
 * @return string|int The text between `"ptk":"` and the next double quote;
 *                    an empty string when the response status is not 200 or
 *                    the marker is absent; 0 when cURL reports a transfer error.
 */
function get_content($url)
{
    global $db;

    $stmt = $db->prepare("SELECT * FROM proxy_settings");
    $stmt->execute();
    $result = $stmt->fetchAll();

    // The media types of the Accept header must be comma separated; the
    // previous concatenation produced "text/xmltext/html".
    $header = array(
        "Accept: text/xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
    );

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)');
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    // Only route through a proxy when one is actually configured.
    if (!empty($result[0]['ip'])) {
        curl_setopt($curl, CURLOPT_PROXY, $result[0]['ip']);
        curl_setopt($curl, CURLOPT_PROXYPORT, 13012);
        curl_setopt($curl, CURLOPT_HTTPPROXYTUNNEL, 0);
    }
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
    curl_setopt($curl, CURLOPT_ENCODING, '');
    curl_setopt($curl, CURLOPT_AUTOREFERER, true);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    // An empty COOKIEFILE switches on cURL's in-memory cookie handling. The
    // former value 1 sent a literal "Cookie: 1" header and made every worker
    // read and write a shared file named "1".
    curl_setopt($curl, CURLOPT_COOKIEFILE, '');
    curl_setopt($curl, CURLOPT_TIMEOUT, 10);

    $html = curl_exec($curl);
    if ($html === false || curl_error($curl)) {
        // Release the handle on the failure path as well.
        curl_close($curl);
        return 0;
    }

    $status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl);
    if ($status != 200) {
        return '';
    }

    $first_step = explode('"ptk":"', $html, 2);
    if (!isset($first_step[1])) {
        return '';
    }
    $second_step = explode('"', $first_step[1], 2);

    return $second_step[0];
}
// The Google API client is installed through Composer; stop with an actionable
// message when the autoloader is missing.
if (!file_exists(__DIR__ . '/vendor/autoload.php')) {
    throw new \Exception('please run "composer require google/apiclient:~2.0" in "' . __DIR__ . '"');
}
require_once __DIR__ . '/vendor/autoload.php';

/*
 * The developer key ("API key" from the Google Cloud Console
 * <https://cloud.google.com/console>) is read from the api_settings table.
 * Please ensure that you have enabled the YouTube Data API for your project.
 */
$api_key = "SELECT api_key FROM api_settings WHERE type='api'";
$stmt = $db->query($api_key);
$key = $stmt ? $stmt->fetchAll() : array();
if (empty($key[0]['api_key'])) {
    // Without a key every API request would be rejected, so fail before doing any work.
    throw new \Exception('no API key found in api_settings for type "api"');
}
$api = $key[0]['api_key'];
$DEVELOPER_KEY = $api;

$client = new Google_Client();
$client->setDeveloperKey($DEVELOPER_KEY);

// Define an object that will be used to make all API requests.
$youtube = new Google_Service_YouTube($client);

// Collects formatted error messages produced while talking to the API.
$htmlBody = '';

try {
// Date stamp written to channel.yt_update for rows refreshed by this run.
$datet = date('Y-m-d');
/**
 * Write a batch of refreshed channel statistics with one multi-row
 * INSERT ... ON DUPLICATE KEY UPDATE statement.
 *
 * @param mixed    $total_view  Not used by this function; kept so existing calls keep working.
 * @param string[] $update_data Pre-formatted SQL value tuples in the column order
 *                              (id, views_per_video, total_subs, yt_update, yt_update_no).
 *                              The tuples are spliced into the statement verbatim, so the
 *                              caller must build them from trusted or escaped values.
 *
 * @return void
 */
function update_data($total_view, $update_data)
{
    global $db;

    // "VALUES" followed by nothing is a syntax error, so an empty batch is a no-op.
    if (empty($update_data)) {
        return;
    }

    $query = "INSERT INTO channel (id, views_per_video,total_subs,yt_update,yt_update_no) VALUES "
        . implode(', ', $update_data)
        . " ON DUPLICATE KEY UPDATE views_per_video = VALUES(views_per_video),total_subs = VALUES(total_subs),yt_update = VALUES(yt_update),yt_update_no = VALUES(yt_update_no)";

    $sth = $db->prepare($query);
    $sth->execute();
}
/**
 * Fetch the next batch (at most 500 rows) of rows that still have to be
 * refreshed in the current pass; when none are left, advance the pass counter
 * of the given update_monitor row.
 *
 * The former `$table = NULL` default was dropped: an optional parameter in
 * front of required ones can never be omitted and is deprecated since PHP 8.0.
 *
 * @param string     $table    Table to read from; must be a plain identifier.
 * @param int|string $unumber  Current pass number; only rows with a lower yt_update_no are returned.
 * @param int|string $monit_id Id of the update_monitor row to advance when nothing is left.
 * @param int|string $less_id  Exclusive upper bound for the row id.
 * @param int|string $offset   Number of matching rows to skip.
 *
 * @return array|null The fetched rows, or null when no row matched.
 *
 * @throws InvalidArgumentException When $table is not a valid identifier.
 */
function get_data_from_table($table, $unumber, $monit_id, $less_id, $offset)
{
    global $db;

    // Identifiers cannot be bound as parameters, so validate the name instead.
    if (!is_string($table) || !preg_match('/^[A-Za-z_][A-Za-z0-9_]*$/', $table)) {
        throw new InvalidArgumentException('Invalid table name');
    }

    $unumber = (int) $unumber;

    // NOTE(review): OFFSET skips rows of the filtered result, it does not mean
    // "start at id >= $offset" — confirm this is the intended partitioning.
    $stmt = $db->prepare(
        "SELECT * FROM `$table` WHERE yt_update_no < :unumber AND id < :less_id LIMIT 500 OFFSET :offset"
    );
    // LIMIT/OFFSET values have to be bound as integers, not as quoted strings.
    $stmt->bindValue(':unumber', $unumber, PDO::PARAM_INT);
    $stmt->bindValue(':less_id', (int) $less_id, PDO::PARAM_INT);
    $stmt->bindValue(':offset', (int) $offset, PDO::PARAM_INT);
    $stmt->execute();
    $result = $stmt->fetchAll();

    // Count the fetched rows; rowCount() is not guaranteed for SELECT statements.
    if (!empty($result)) {
        return $result;
    }

    // Nothing left for this pass: bump the counter so the next run starts a new pass.
    $advance = $db->prepare("UPDATE update_monitor SET number = :number WHERE id = :id");
    $advance->execute(array(':number' => $unumber + 1, ':id' => $monit_id));

    return null;
}
// Main update pass: load this worker's progress row, fetch the next batch of
// channels, refresh their statistics through the YouTube API and write all
// refreshed rows back with a single multi-row statement.
$d = $db->prepare("SELECT * FROM update_monitor WHERE id = :id");
$d->execute(array(':id' => $monit_id));
$unumber_res = $d->fetchAll();
if (empty($unumber_res)) {
    // Without a progress row the pass number is unknown and the batch query cannot be built.
    echo "No update_monitor row found for id " . htmlspecialchars((string) $monit_id);
    exit;
}
$unumber = $unumber_res[0]['number'];
$unumbers = $unumber_res[0]['number'];
$result = get_data_from_table('channel', $unumber, $monit_id, $less_id, $offset);

echo "\n<pre>";

$total_view = array();
$update_data = array();

if ($result) {
    $datet = date('Y-m-d');

    // Prepared once, executed per channel.
    $mark_processed = $db->prepare("UPDATE channel SET yt_update_no = :number WHERE channel_id = :channel_id");
    $delete_channel = $db->prepare("DELETE FROM `channel` WHERE channel_id = :channel_id");

    // NOTE(review): channels.list accepts a comma-separated id list, so several
    // channels could be fetched per request — worth confirming against the API
    // documentation if throughput is the bottleneck.
    foreach ($result as $searchResult) {
        $id = $searchResult['channel_id'];
        $row_id = $searchResult['id'];

        // Errors are handled per channel: one failing channel must not abort
        // the batch and throw away the rows that were already collected.
        try {
            $channels = $youtube->channels->listChannels("statistics,snippet,contentDetails", array(
                'id' => $id,
            ));

            if (empty($channels['items'])) {
                // The API no longer knows this channel: remove it from the table.
                $delete_channel->execute(array(':channel_id' => $id));
                continue;
            }

            $total_subscribers = 0;
            $playlist_id = null;
            foreach ($channels as $channel) {
                $total_subscribers = $channel['statistics']['subscriberCount'];
                $playlist_id = $channel['contentDetails']['relatedPlaylists']['uploads'];
            }

            try {
                $videos = $youtube->playlistItems->listPlaylistItems(
                    'snippet',
                    array('playlistId' => $playlist_id, 'maxResults' => 7)
                );
            } catch (Google_Service_Exception $e) {
                if ($e->getCode() != 404) {
                    throw $e;
                }
                // A missing uploads playlist is treated like a channel without videos.
                $videos = null;
            }

            if (empty($videos['items'])) {
                // No videos to average: only record that the channel was visited in this pass.
                $mark_processed->execute(array(':number' => $unumbers, ':channel_id' => $id));
                continue;
            }

            $total_result = count($videos['items']);
            $video_ids = array();
            foreach ($videos as $video) {
                $video_ids[] = $video['snippet']['resourceId']['videoId'];
            }

            // The id list is joined with plain commas, without padding spaces.
            $video_info = $youtube->videos->listVideos(
                'statistics,snippet',
                array('id' => implode(',', $video_ids))
            );

            $view_counts = 0;
            foreach ($video_info as $video) {
                // Skip videos whose statistics carry no view count.
                if (isset($video['statistics']['viewCount'])) {
                    $view_counts += $video['statistics']['viewCount'];
                }
            }
            $view_per_video = $view_counts / $total_result;

            // The tuple is spliced into SQL verbatim by update_data(), so every
            // numeric field is cast before it is formatted.
            $update_data[] = "(" . (int) $row_id
                . ",'" . (float) $view_per_video
                . "','" . (int) $total_subscribers
                . "','" . $datet
                . "'," . (int) $unumbers . ")";
        } catch (Google_Service_Exception $e) {
            // The channel keeps its old yt_update_no and is retried on a later run.
            $htmlBody .= sprintf('<p>A service error occurred: <code>%s</code></p>', htmlspecialchars($e->getMessage()));
        } catch (Google_Exception $e) {
            $htmlBody .= sprintf('<p>An client error occurred: <code>%s</code></p>', htmlspecialchars($e->getMessage()));
        }
    }

    // An empty batch would produce an invalid INSERT statement.
    if (!empty($update_data)) {
        update_data($total_view, $update_data);
    }
}
$time = microtime(true) - $_SERVER["REQUEST_TIME_FLOAT"];
echo "Process Time: {$time}";
} catch (Google_Service_Exception $e) {
    $htmlBody .= sprintf('<p>A service error occurred: <code>%s</code></p>', htmlspecialchars($e->getMessage()));
} catch (Google_Exception $e) {
    $htmlBody .= sprintf('<p>An client error occurred: <code>%s</code></p>', htmlspecialchars($e->getMessage()));
}
// Print the collected API errors so they show up in the output of the cron runner.
echo $htmlBody;
exit;
?>
能帮我解决这个问题吗?如何在一天之内更新全部1500万个频道?
谢谢!