-- Raw hostname observations. Rows are inserted without any duplicate check,
-- so the same hostname can appear many times until a clean-up pass merges
-- the duplicates into the row with the lowest id.
CREATE TABLE hostname_table (
    id           INT      NOT NULL AUTO_INCREMENT,
    hostname     CHAR(65) NOT NULL,
    interval_avg INT,                 -- nullable: freshly inserted rows carry NULL
    last_update  DATETIME NOT NULL,
    numb_updates INT,                 -- nullable: freshly inserted rows carry NULL
    PRIMARY KEY (id)
)
我有这个表,我将500-600k行数据导入其中。在写入数据库时,我不检查重复项,因为我想知道每个主机有多少重复项,并且我还想知道每个主机名更新之间的间隔。
hostname_table中的示例值:
id hostname interval_avg last_update numb_updates
1 www.host.com 60 2012-04-25 20:22:21 1
2 www.hostname.com 10 2012-04-25 20:22:21 5
3 www.name.com NULL 2012-04-25 20:22:21 NULL
4 www.host.com NULL 2012-04-25 20:22:26 NULL
5 www.host.com NULL 2012-04-25 20:22:36 NULL
清理之后我希望得到的结果:
id hostname interval_avg last_update numb_updates
1 www.host.com 25 2012-04-25 20:22:36 3
2 www.hostname.com 10 2012-04-25 20:22:21 5
3 www.name.com NULL 2012-04-25 20:22:21 NULL
对于这样庞大的数据库,我不想为此发送太多查询,但我认为这类操作至少需要 3 个查询(如果我错了,请纠正我)。每小时大约会有 50 万个新行,其中约 50% 或更多是重复的,因此必须尽可能高效地去除这些重复项,同时仍要记录重复发生的次数和频率(因此才需要更新 interval_avg 和 numb_updates)。
这是一个三步问题,我希望这里的社区能伸出援助之手。
已解决。经过一两天的研究,我把其中一部分的耗时降低了 94%,另一部分降低了约 97%。我真心希望这能帮到寻找同类解决方案的人。如果选错了方案,MySQL 加上大型数据库可能会带来大麻烦。(在最终方案中,我把 last_update 列从 DATETIME 改成了 INT(10),存入的值也从格式化的时间改为时间戳,以便能够取得 MAX(last_update) 和 MIN(last_update) 的值。)
(感谢GolezTrol帮助解决部分问题)
答案 0 :(得分:4)
如果要按主机名聚合,就无法为每个主机名保留 interval_avg 和 numb_updates 的各个不同取值。你是想对它们求 SUM 还是 AVG?或者只想保留最小 id 那一行的值?
在下面的查询中,我对它们求和。
-- One output row per hostname: the lowest id of the group, the summed
-- interval_avg and numb_updates (SUM skips NULLs), and how many rows share
-- the hostname.
SELECT
    MIN(id)           AS id,
    hostname,
    SUM(interval_avg) AS total_interval_avg,
    SUM(numb_updates) AS total_numb_updates,
    COUNT(*)          AS hostname_count
FROM hostname_table
GROUP BY hostname
在此之后,您需要用 interval_avg 和 numb_updates 的正确值更新该查询返回的每个 id 对应的行。
之后,您需要删除该查询未返回的所有 id 对应的行。
-- Delete every duplicate row, keeping only the lowest id per hostname.
--
-- The grouped SELECT is wrapped in a derived table (keep_rows) because MySQL
-- refuses a DELETE whose subquery reads the target table directly
-- (error 1093: "You can't specify target table ... for update in FROM
-- clause"). A derived table containing GROUP BY is materialized before the
-- DELETE runs, which avoids that restriction.
--
-- NOT IN is safe from the NULL trap here: id is the NOT NULL primary key,
-- so MIN(id) can never be NULL.
DELETE FROM hostname_table
WHERE id NOT IN (
    SELECT keep_id
    FROM (
        SELECT MIN(id) AS keep_id
        FROM hostname_table
        GROUP BY hostname
    ) AS keep_rows
);
答案 1 :(得分:0)
1. 为每个 hostname 选出 last_update 的最小值和最大值、SUM(interval_avg)、SUM(numb_updates) 以及重复行数 COUNT(*)
-- Step 1: one summary row per hostname that occurs more than once.
--   id                  lowest id in the group
--   total_numb_updates  SUM of numb_updates over the group
--   total_interval_avg  SUM of interval_avg over the group
--   last_update_max     newest last_update in the group
--   last_update_min     oldest last_update in the group
--   hostname_count      number of rows sharing the hostname
-- Summing is acceptable because, in the sample data, newly inserted
-- duplicates carry NULL in interval_avg / numb_updates and SUM skips NULLs,
-- so only the already-aggregated row contributes.
SELECT
    MIN(id)           AS id,
    hostname,
    SUM(numb_updates) AS total_numb_updates,
    SUM(interval_avg) AS total_interval_avg,
    MAX(last_update)  AS last_update_max,
    MIN(last_update)  AS last_update_min,
    COUNT(*)          AS hostname_count
FROM hostname_table
GROUP BY hostname
HAVING COUNT(*) > 1
//Get all last_update values from each duplicate hostname (including the original).
//Don't do this in a separate query: you only need first + last + row count to figure
//out the interval average. It took me a while to realize this, so I tried many
//variations with little success (they took too long with 600k+ rows)
//
// --- I will include the solution I didn't go for, ---
// --- so others wont do the same mistake ---
//
// START DONT USE THIS
// 2.63sec @ 10000 rows
// Builds and runs a query returning id, hostname and last_update for every
// row whose hostname occurs more than once: the table is inner-joined against
// a grouped subquery (t2) that lists the hostnames with COUNT(*) > 1.
// $db_table is concatenated into the SQL text as the table name, so it must
// come from trusted code, never from user input.
$sql = "SELECT
id,
".$db_table.".hostname,
last_update
FROM
".$db_table."
INNER JOIN (
SELECT
hostname,
COUNT(*)
FROM
".$db_table."
GROUP BY
hostname
HAVING
COUNT(*)>1
) as t2
ON
".$db_table.".hostname = t2.hostname";
// Executes on connection $con; $resource receives mysql_query()'s return value.
$resource = mysql_query($sql,$con);
// END DONT USE THIS (below is a 94% improvement)
//
// START THIS IS BETTER, BUT DONT USE THIS
// 0.16 sec @ 10000 rows
//Select everything from the table
// Collects the full rows of every hostname that occurs more than once into
// $array_duplicates, a list of
//   [id, hostname, interval_avg, last_update, numb_updates]
// using three queries: all ids, ids of non-duplicated hostnames, then the rows
// for the difference of the two id sets.
// $db_table is concatenated into the SQL text as the table name, so it must
// come from trusted code, never from user input.

// 1) Every id in the table.
$sql = "SELECT id
FROM ".$db_table;
$resource = mysql_query($sql,$con);
$array_id_all = array();
while($assoc = mysql_fetch_assoc($resource)){
    $array_id_all[] = $assoc['id'];
}

// 2) ids of the hostnames that occur exactly once (i.e. have no duplicates).
$sql = "SELECT
MIN(id) as id,
hostname
FROM
".$db_table."
GROUP BY
hostname
HAVING
COUNT(*)=1";
$resource = mysql_query($sql,$con);
$array_id_unique = array();
while($assoc = mysql_fetch_assoc($resource)){
    $array_id_unique[] = $assoc['id'];
}

// 3) Whatever is left after removing the unique ids belongs to a hostname
//    that has duplicates.
$array_id_non_unique = array_diff($array_id_all, $array_id_unique);
$id_list_non_unique = implode(", ", $array_id_non_unique);

$array_duplicates = array();
// Guard: when no hostname is duplicated the id list is empty and the query
// would read "WHERE id IN ()", which is a SQL syntax error. Skip the query
// and leave $array_duplicates empty in that case.
if (count($array_id_non_unique) > 0) {
    //Select everything from the table when the IDs are IN $id_list_non_unique
    $sql = "SELECT *
FROM ".$db_table."
WHERE id IN (".$id_list_non_unique.")";
    $resource = mysql_query($sql,$con);
    while($assoc = mysql_fetch_assoc($resource)){
        $array_duplicates[] = array($assoc['id'], $assoc['hostname'], $assoc['interval_avg'], $assoc['last_update'], $assoc['numb_updates']);
    }
}
// END THIS IS BETTER, BUT DONT USE THIS
(感谢Nick Fortescue @ https://stackoverflow.com/a/877051/1248273)
2. 在每组 min(id) 那一行上更新 interval_avg 和 numb_updates,并用 max(id) 那一行的值更新该行的 last_update
//update the interval_avg, last_update and numb_update value of the min(id)
//of each duplicate hostname.
// --- I will include the solution I didn't go for, ---
// --- so others wont do the same mistake ---
//
// START DONT USE THIS
// 167 secs @ 500k rows
-- Bulk update expressed as one CASE per column, keyed on id.
-- The column is numb_updates (as declared in the table definition); the
-- original snippet's "numb_update" names a column that does not exist.
-- A CASE without ELSE yields NULL for ids it does not list, so the WHERE
-- list must contain exactly the ids that appear in the WHEN branches;
-- otherwise last_update (NOT NULL) would be assigned NULL.
UPDATE hostname_table
SET
    interval_avg = CASE id
        WHEN 1 THEN 25
        -- etc: one WHEN per id
    END,
    last_update = CASE id
        WHEN 1 THEN '2012-04-25 20:22:36'
        -- etc: one WHEN per id
    END,
    numb_updates = CASE id
        WHEN 1 THEN 3
        -- etc: one WHEN per id
    END
WHERE id IN (1);
// END DONT USE THIS
//
// START USE THIS
// 5.75 secs @ 500k rows (96.6% improvement)
-- Batched upsert used as a bulk UPDATE: every tuple names an id that already
-- exists, so the primary-key collision sends each row to the
-- ON DUPLICATE KEY UPDATE branch, which overwrites the three aggregate
-- columns with the values supplied in that tuple.
--
-- hostname is supplied even though it is never updated: it is NOT NULL with
-- no default, and under strict SQL mode MySQL rejects an INSERT that omits
-- such a column before the duplicate-key branch is reached.
-- Numeric values are written as numbers rather than quoted strings to avoid
-- implicit string-to-int conversion.
INSERT INTO hostname_table (id, hostname, interval_avg, last_update, numb_updates)
VALUES
    (1, 'www.host.com', 25, '2012-04-25 20:22:36', 3)
    -- etc: append further (id, hostname, interval_avg, last_update,
    -- numb_updates) tuples here, separated by commas
ON DUPLICATE KEY UPDATE
    interval_avg = VALUES(interval_avg),
    last_update  = VALUES(last_update),
    numb_updates = VALUES(numb_updates);
// END USE THIS
(感谢Michiel de Mare @ https://stackoverflow.com/a/3466/1248273)
3. 删除除 min(id) 以外的所有重复项
//delete all duplicates except min(id)
-- Delete every duplicate row, keeping only the lowest id per hostname.
--
-- This replaces the ALTER IGNORE TABLE ... ADD UNIQUE / DROP INDEX trick:
-- the IGNORE clause of ALTER TABLE was removed in MySQL 5.7.4, so that
-- statement is an error on current servers, and which duplicate survived
-- was only implied rather than stated. The table definition is left
-- unchanged, exactly as it was after the original add-then-drop of the
-- unique index.
--
-- survivors is a derived table, so it is materialized before the DELETE
-- runs; that is what allows the statement to read hostname_table while
-- deleting from it. HAVING COUNT(*) > 1 limits the work to hostnames that
-- actually have duplicates.
DELETE dup
FROM hostname_table AS dup
INNER JOIN (
    SELECT
        hostname,
        MIN(id) AS keep_id
    FROM hostname_table
    GROUP BY hostname
    HAVING COUNT(*) > 1
) AS survivors
    ON survivors.hostname = dup.hostname
WHERE dup.id <> survivors.keep_id;
(感谢GolezTrol在选择我需要的第一个信息方面做出了正确的推动)