我是数据库新手,正在使用日终股票数据学习 Python 3.7 和 MySQL。我已设法以编程方式将数据加载到数据库中,但想避免插入重复的行。我正在逐行解析文本文件。
到目前为止,这是我的代码。
import pymysql
import pandas as pd
import sys
# Path of the comma-separated text file that fun_read_file() opens and loads.
ticker_file = 'C:/testfile.txt'
# Read the text file line by line and pass each comma-separated record on for insertion.
def fun_read_file(ticker_file):
    """Read a comma-separated ticker file and insert each record into MySQL.

    Every non-blank line must contain exactly seven comma-separated fields.
    The fields are printed and then forwarded, in file order, to
    ``csv_to_mysql`` together with the connection settings defined here.

    Args:
        ticker_file: Path of the text file to read.

    Raises:
        ValueError: If a non-blank line does not split into exactly seven
            fields.
    """
    host = 'localhost'
    user = 'user'
    password = 'password'
    db = 'trading'
    with open(ticker_file, 'r') as f:
        for line in f:
            stripped = line.strip('\n\r')
            # A blank line (e.g. a trailing newline at end of file) holds no
            # record; unpacking it below would raise ValueError, so skip it.
            if not stripped.strip():
                continue
            value1, value2, value3, value4, value5, value6, value7 = stripped.split(',')
            print(value1, value2, value3, value4, value5, value6, value7)
            # Hand the parsed record to the insert routine.
            csv_to_mysql(host, user, password, db,
                         value1, value2, value3, value4, value5, value6, value7)
def csv_to_mysql(host, user, password, db,
                 value1, value2, value3, value4, value5, value6, value7):
    """Insert a single row into the ``asx`` table.

    A new connection is opened for the call, one parameterized INSERT is
    executed (autocommit is enabled), and the connection is closed again
    whether or not the insert succeeded.

    Args:
        host: MySQL server host name.
        user: MySQL user name.
        password: Password for ``user``.
        db: Name of the database that contains the ``asx`` table.
        value1: Value for the ``Symbol`` column.
        value2: Value for the ``Date`` column.
        value3: Value for the ``Open`` column.
        value4: Value for the ``High`` column.
        value5: Value for the ``Low`` column.
        value6: Value for the ``Close`` column.
        value7: Value for the ``Volume`` column.

    Raises:
        SystemExit: With status 1 when connecting or inserting fails; the
            error message is printed first.
    """
    load_sql = ('INSERT INTO asx (Symbol,Date,Open,High,Low,Close,Volume) '
                'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    args = [value1, value2, value3, value4, value5, value6, value7]
    print('You are in csv_to_mysql')
    print(args)
    con = None
    try:
        con = pymysql.connect(host=host,
                              user=user,
                              password=password,
                              db=db,
                              autocommit=True,
                              local_infile=1)
        print('Connected to DB: {}'.format(host))
        # The cursor is a context manager, so it is released even if
        # execute() raises.
        with con.cursor() as cursor:
            cursor.execute(load_sql, args)
        print('Successfully loaded the table from csv.')
    except Exception as e:
        print('Error: {}'.format(str(e)))
        sys.exit(1)
    finally:
        # Runs on success, on failure and while SystemExit propagates, so the
        # connection is never left open.
        if con is not None:
            con.close()
# Run the script: load every record found in ticker_file.
fun_read_file(ticker_file)
这是表asx中的当前数据:
mysql> select * from asx;
+--------+------------+--------+--------+--------+--------+---------+
| Symbol | Date | Open | High | Low | Close | Volume |
+--------+------------+--------+--------+--------+--------+---------+
| 14D | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 | 243779 |
| 14D | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 | 243779 |
| 14D | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 | 243779 |
| 14DO | 2019-01-11 | 0.0700 | 0.0700 | 0.0700 | 0.0700 | 0 |
| 1AD | 2019-01-11 | 0.2400 | 0.2400 | 0.2400 | 0.2400 | 0 |
| 1AG | 2019-01-11 | 0.0310 | 0.0320 | 0.0310 | 0.0310 | 719145 |
| 1AL | 2019-01-11 | 0.9100 | 0.9100 | 0.9100 | 0.9100 | 0 |
| 1ST | 2019-01-11 | 0.0280 | 0.0280 | 0.0280 | 0.0280 | 0 |
| 3DP | 2019-01-11 | 0.0500 | 0.0560 | 0.0500 | 0.0520 | 3919592 |
+--------+------------+--------+--------+--------+--------+---------+
9 rows in set (0.02 sec)
如您所见,数据的前三行都是重复的。 我要导入大量这些文件,重复行的机会很高。 有没有办法检查我要插入的行在表中还不存在? 检查符号和日期值应足以确保该数据集的唯一性。但是我不确定如何实现这一目标。
预先感谢您的帮助。
为澄清起见添加: 到目前为止非常感谢您的投入。
我已经按照各位的答复阅读了有关主键的资料,并有一些后续问题。 我的理解是,主键在表中必须唯一。由于日终股票数据的性质,我可能会遇到以下几行。
+--------+------------+--------+--------+--------+--------+---------+
| Symbol | Date | Open | High | Low | Close | Volume |
+--------+------------+--------+--------+--------+--------+---------+
| 14D | 2019-01-12 | 0.3000 | 0.4950 | 0.2950 | 0.4900 | 123456 |
| 14D | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 | 243779 |
| 14D | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 | 243779 |
| 14DO | 2019-01-11 | 0.0700 | 0.0700 | 0.0700 | 0.0700 | 0 |
| 1AD | 2019-01-11 | 0.2400 | 0.2400 | 0.2400 | 0.2400 | 0 |
如您所见,Symbol 14D每个日期都有一行。第1行中的数据有效。但是,第2行和第3行是重复的。我需要删除第2行或第3行,以保持表格的准确性。
在这种情况下,我还应该将 Symbol 和 Date 设为主键吗?
答案 0 :(得分:1)
我建议您阅读 INSERT IGNORE ,MySQL的 ON DUPLICATE KEY UPDATE 关键字,并查看 PRIMARY KEY 和 UNIQUE 约束。
以下是可以解决您问题的快速链接: Mysql Handling Duplicates
如果您还有问题,我可以回答。
答案 1 :(得分:1)
我仍然是Python的初学者,但是我知道数据库。我要做的是首先执行SELECT查询,以验证MySQL表中是否存在具有给定Symbol和Date的记录,并且仅在SELECT返回0行时才执行INSERT。您还应该考虑将这两列用作该表的主键。这将确保不插入任何重复项(但是插入重复项可能会引发必须处理的异常)。
答案 2 :(得分:0)
不太确定该如何正确地作答。 我最终创建了一个名为 check_row 的新函数,并使用一条 SELECT 语句来检查该行是否已经存在。在此数据集中,我只需要检查表中是否已有包含 value1(Symbol)和 value2(Date)的行,即可保持数据的准确性。 谢谢 tutiplain 为我指出这个方向。
# Parameterized lookup: count the rows already stored for this Symbol/Date pair.
query = 'SELECT COUNT(*) from asx WHERE Symbol = %s AND Date = %s'
# str_query_value2 is the date string built for the lookup (see the full code below).
args = [value1, str_query_value2]
这是下面的完整代码。
import pymysql
import pandas as pd
import sys
# Path of the comma-separated text file that fun_read_file() opens and loads.
ticker_file = 'C:/test.txt'
# Read the text file line by line and pass each comma-separated record on for checking.
def fun_read_file(ticker_file):
    """Read a comma-separated ticker file and submit each record for loading.

    Every non-blank line must contain exactly seven comma-separated fields.
    The fields are printed and then forwarded, in file order, to
    ``check_row`` together with the connection settings defined here.

    Args:
        ticker_file: Path of the text file to read.

    Raises:
        ValueError: If a non-blank line does not split into exactly seven
            fields.
    """
    host = 'localhost'
    user = 'user'
    password = 'password'
    db = 'trading'
    with open(ticker_file, 'r') as f:
        for line in f:
            stripped = line.strip('\n\r')
            # A blank line (e.g. a trailing newline at end of file) holds no
            # record; unpacking it below would raise ValueError, so skip it.
            if not stripped.strip():
                continue
            value1, value2, value3, value4, value5, value6, value7 = stripped.split(',')
            print(value1, value2, value3, value4, value5, value6, value7)
            # check_row decides whether the record still needs inserting.
            check_row(host, user, password, db,
                      value1, value2, value3, value4, value5, value6, value7)
# Insert row into table
def csv_to_mysql(host, user, password, db,
                 value1, value2, value3, value4, value5, value6, value7):
    """Insert a single row into the ``asx`` table.

    A new connection is opened for the call, one parameterized INSERT is
    executed (autocommit is enabled), and the connection is closed again
    whether or not the insert succeeded.

    Args:
        host: MySQL server host name.
        user: MySQL user name.
        password: Password for ``user``.
        db: Name of the database that contains the ``asx`` table.
        value1: Value for the ``Symbol`` column.
        value2: Value for the ``Date`` column.
        value3: Value for the ``Open`` column.
        value4: Value for the ``High`` column.
        value5: Value for the ``Low`` column.
        value6: Value for the ``Close`` column.
        value7: Value for the ``Volume`` column.

    Raises:
        SystemExit: With status 1 when connecting or inserting fails; the
            error message is printed first.
    """
    load_sql = ('INSERT INTO asx (Symbol,Date,Open,High,Low,Close,Volume) '
                'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    args = [value1, value2, value3, value4, value5, value6, value7]
    con = None
    try:
        con = pymysql.connect(host=host,
                              user=user,
                              password=password,
                              db=db,
                              autocommit=True,
                              local_infile=1)
        print('Connected to DB: {}'.format(host))
        # The cursor is a context manager, so it is released even if
        # execute() raises.
        with con.cursor() as cursor:
            cursor.execute(load_sql, args)
        print('Successfully loaded the table from csv.')
    except Exception as e:
        print('Error: {}'.format(str(e)))
        sys.exit(1)
    finally:
        # Runs on success, on failure and while SystemExit propagates, so the
        # connection is never left open.
        if con is not None:
            con.close()
# Check for duplicate row before insertion into table
def check_row(host, user, password, db,
              value1, value2, value3, value4, value5, value6, value7):
    """Insert a record into ``asx`` unless its Symbol/Date pair already exists.

    The date in ``value2`` is rewritten from ``YYYYMMDD`` to ``YYYY-MM-DD``
    for the lookup. If the table already holds at least one row with that
    Symbol and Date, the record is skipped and the function simply returns so
    the caller can continue with the next record. Otherwise the unmodified
    values are forwarded to ``csv_to_mysql``.

    Args:
        host: MySQL server host name.
        user: MySQL user name.
        password: Password for ``user``.
        db: Name of the database that contains the ``asx`` table.
        value1: Symbol of the record; first half of the duplicate check.
        value2: Date of the record as a ``YYYYMMDD`` string; second half of
            the duplicate check.
        value3: Forwarded unchanged to ``csv_to_mysql``.
        value4: Forwarded unchanged to ``csv_to_mysql``.
        value5: Forwarded unchanged to ``csv_to_mysql``.
        value6: Forwarded unchanged to ``csv_to_mysql``.
        value7: Forwarded unchanged to ``csv_to_mysql``.

    Raises:
        SystemExit: With status 1 when connecting or querying fails; the
            error message is printed first.
    """
    # Rewrite the date string, e.g. 20190111 -> 2019-01-11, for the query.
    str_query_value2 = '{}-{}-{}'.format(value2[:4], value2[4:6], value2[-2:])
    print(str_query_value2)

    # Parameterized lookup: how many rows already exist for this Symbol/Date?
    query = 'SELECT COUNT(*) from asx WHERE Symbol = %s AND Date = %s'
    args = [value1, str_query_value2]

    con = None
    try:
        con = pymysql.connect(host=host,
                              user=user,
                              password=password,
                              db=db,
                              autocommit=True,
                              local_infile=1)
        print('Connected to DB: {}'.format(host))
        with con.cursor() as cursor:
            cursor.execute(query, args)
            print('Successfully queried the asx table.')
            result = cursor.fetchall()
        print(result)
        # fetchall() returns a sequence of row tuples; COUNT(*) is the first
        # column of the first (and only) row.
        int_result = result[0][0]
        print(int_result)
    except Exception as e:
        print('Error: {}'.format(str(e)))
        sys.exit(1)
    finally:
        # Close the lookup connection on every path, including errors.
        if con is not None:
            con.close()

    if int_result >= 1:
        # Duplicate: leave only this function. Calling exit() here would stop
        # the whole program and silently drop every remaining line of the file.
        return

    # NOTE(review): SELECT-then-INSERT is not atomic; a UNIQUE or PRIMARY KEY
    # on (Symbol, Date) would let the database enforce uniqueness itself.
    csv_to_mysql(host, user, password, db,
                 value1, value2, value3, value4, value5, value6, value7)
# Run the script: check and load every record found in ticker_file.
fun_read_file(ticker_file)