Question

我是数据库新手，正在使用日终股票行情数据学习 Python 3.7 和 MySQL。我已经设法以编程方式将数据加载到数据库中。但是，我想避免插入重复的行。我正在逐行解析文本文件。

到目前为止，这是我的代码。

import pymysql
import pandas as pd
import sys

# Path of the comma-separated text file that fun_read_file() reads.
ticker_file = 'C:/testfile.txt'

# Read the text file line by line and pass each comma-separated row on for insertion.
def fun_read_file(ticker_file):
    """Read ``ticker_file`` line by line and insert each row into MySQL.

    Every non-blank line must hold exactly seven comma-separated fields.
    They are forwarded, in order, to ``csv_to_mysql`` as ``value1`` ..
    ``value7`` (inserted there as Symbol, Date, Open, High, Low, Close,
    Volume).

    Args:
        ticker_file: Path of the comma-separated text file to load.

    Raises:
        ValueError: If a non-blank line does not contain exactly seven
            comma-separated fields.
    """
    host = 'localhost'
    user = 'user'
    password = 'password'
    db = 'trading'
    with open(ticker_file, 'r') as f:
        for line in f:
            stripped = line.strip('\n\r')
            # A blank line (typically a trailing one) carries no record;
            # skip it instead of failing on the seven-way unpack below.
            if not stripped.strip():
                continue
            value1, value2, value3, value4, value5, value6, value7 = stripped.split(',')
            print(value1,value2,value3,value4,value5,value6,value7)
            # Hand the parsed fields over for insertion.
            csv_to_mysql(host, user, password, db, value1, value2, value3, value4, value5, value6, value7)

def csv_to_mysql(host, user, password, db, value1, value2, value3, value4, value5, value6, value7):
    """Insert a single row into the ``asx`` table.

    Opens a new connection for this one row, runs a parameterized INSERT
    and closes the connection again, whether or not the INSERT succeeded.

    Args:
        host, user, password, db: Connection settings passed to
            ``pymysql.connect``.
        value1 .. value7: Column values, in the order Symbol, Date, Open,
            High, Low, Close, Volume.

    Raises:
        SystemExit: With status 1 after printing the error, if connecting
            or inserting fails.
    """
    load_sql = 'INSERT INTO asx (Symbol,Date,Open,High,Low,Close,Volume) VALUES (%s, %s, %s, %s, %s, %s, %s)'
    args = [value1, value2, value3, value4, value5, value6, value7]
    print('You are in csv_to_mysql')
    print(args)
    con = None
    try:
        con = pymysql.connect(host=host,
                                user=user,
                                password=password,
                                db=db,
                                autocommit=True,
                                local_infile=1)
        print('Connected to DB: {}'.format(host))
        # Values are bound as parameters, never formatted into the SQL text.
        cursor = con.cursor()
        cursor.execute(load_sql, args)
        print('Successfully loaded the table from csv.')
    except Exception as e:
        print('Error: {}'.format(str(e)))
        sys.exit(1)
    finally:
        # Release the connection on the failure path too; skip it when the
        # driver already dropped the connection (close() would raise then).
        if con is not None and con.open:
            con.close()

# Execute the script
fun_read_file(ticker_file)

这是表asx中的当前数据：

mysql> select * from asx;

+--------+------------+--------+--------+--------+--------+---------+
| Symbol | Date       | Open   | High   | Low    | Close  | Volume  |
+--------+------------+--------+--------+--------+--------+---------+
| 14D    | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 |  243779 |
| 14D    | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 |  243779 |
| 14D    | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 |  243779 |
| 14DO   | 2019-01-11 | 0.0700 | 0.0700 | 0.0700 | 0.0700 |       0 |
| 1AD    | 2019-01-11 | 0.2400 | 0.2400 | 0.2400 | 0.2400 |       0 |
| 1AG    | 2019-01-11 | 0.0310 | 0.0320 | 0.0310 | 0.0310 |  719145 |
| 1AL    | 2019-01-11 | 0.9100 | 0.9100 | 0.9100 | 0.9100 |       0 |
| 1ST    | 2019-01-11 | 0.0280 | 0.0280 | 0.0280 | 0.0280 |       0 |
| 3DP    | 2019-01-11 | 0.0500 | 0.0560 | 0.0500 | 0.0520 | 3919592 |
+--------+------------+--------+--------+--------+--------+---------+
9 rows in set (0.02 sec)

如您所见，数据的前三行都是重复的。我要导入大量这些文件，重复行的机会很高。有没有办法检查我要插入的行在表中还不存在？检查符号和日期值应足以确保该数据集的唯一性。但是我不确定如何实现这一目标。

预先感谢您的帮助。

为澄清起见补充：非常感谢大家到目前为止的回复。

我已经阅读了有关主键的答复，并对它们有一些后续问题。我的理解是，主键在表中必须唯一。由于日终股票数据的性质，我可能会遇到以下几行。

+--------+------------+--------+--------+--------+--------+---------+
| Symbol | Date       | Open   | High   | Low    | Close  | Volume  |
+--------+------------+--------+--------+--------+--------+---------+
| 14D    | 2019-01-12 | 0.3000 | 0.4950 | 0.2950 | 0.4900 |  123456 |
| 14D    | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 |  243779 |
| 14D    | 2019-01-11 | 0.2950 | 0.2950 | 0.2750 | 0.2750 |  243779 |
| 14DO   | 2019-01-11 | 0.0700 | 0.0700 | 0.0700 | 0.0700 |       0 |
| 1AD    | 2019-01-11 | 0.2400 | 0.2400 | 0.2400 | 0.2400 |       0 |

如您所见，Symbol 14D每个日期都有一行。第1行中的数据有效。但是，第2行和第3行是重复的。我需要删除第2行或第3行，以保持表格的准确性。

在这种情况下，我还应该把 Symbol 和 Date 一起设为主键吗？

Answer 1

我建议您阅读 INSERT IGNORE ，MySQL的 ON DUPLICATE KEY UPDATE 关键字，并查看 PRIMARY KEY 和 UNIQUE 约束。

以下是可以解决您问题的快速链接： Mysql Handling Duplicates

如果您还有问题，我可以回答。

Answer 2

我仍然是Python的初学者，但是我知道数据库。我要做的是首先执行SELECT查询，以验证MySQL表中是否存在具有给定Symbol和Date的记录，并且仅在SELECT返回0行时才执行INSERT。您还应该考虑将这两列用作该表的主键。这将确保不插入任何重复项（但是插入重复项可能会引发必须处理的异常）。

Answer 3

非常注意如何正确回答。我最终创建了一个名为check_row的新函数，并使用了一条select语句来检查该行是否已经存在。在此数据集中，我只需要检查表中的行是否已经包含value1（Symbol）和value2（Date），以保持数据的准确性。谢谢tutiplain向我指出这个方向。

query = 'SELECT COUNT(*) from asx WHERE Symbol = %s AND Date = %s'
args = [value1, str_query_value2]

这是下面的完整代码。

import pymysql
import pandas as pd
import sys

# Path of the comma-separated text file that fun_read_file() reads.
ticker_file = 'C:/test.txt'

# Read the text file line by line and pass each comma-separated row on for the duplicate check.
def fun_read_file(ticker_file):
    """Read ``ticker_file`` line by line and load each new row into MySQL.

    Every non-blank line must hold exactly seven comma-separated fields.
    They are forwarded, in order, to ``check_row`` as ``value1`` ..
    ``value7``; ``check_row`` inserts the row only when no row with the
    same Symbol and Date exists yet.

    Args:
        ticker_file: Path of the comma-separated text file to load.

    Raises:
        ValueError: If a non-blank line does not contain exactly seven
            comma-separated fields.
    """
    # Unused alternative (bulk load of the whole file in one statement):
    #load_sql = "LOAD DATA INFILE 'C:/test.txt' INTO TABLE asx FIELDS TERMINATED BY ',' LINES TERMINATED BY '\r\n';"
    host = 'localhost'
    user = 'user'
    password = 'password'
    db = 'trading'
    with open(ticker_file, 'r') as f:
        for line in f:
            stripped = line.strip('\n\r')
            # A blank line (typically a trailing one) carries no record;
            # skip it instead of failing on the seven-way unpack below.
            if not stripped.strip():
                continue
            value1, value2, value3, value4, value5, value6, value7 = stripped.split(',')
            print(value1,value2,value3,value4,value5,value6,value7)
            # Insert the row unless it is already present.
            check_row(host, user, password, db, value1, value2, value3, value4, value5, value6, value7)

# Insert row into table         
def csv_to_mysql(host, user, password, db, value1, value2, value3, value4, value5, value6, value7):
    """Insert a single row into the ``asx`` table.

    Opens a new connection for this one row, runs a parameterized INSERT
    and closes the connection again, whether or not the INSERT succeeded.

    Args:
        host, user, password, db: Connection settings passed to
            ``pymysql.connect``.
        value1 .. value7: Column values, in the order Symbol, Date, Open,
            High, Low, Close, Volume.

    Raises:
        SystemExit: With status 1 after printing the error, if connecting
            or inserting fails.
    """
    load_sql = 'INSERT INTO asx (Symbol,Date,Open,High,Low,Close,Volume) VALUES (%s, %s, %s, %s, %s, %s, %s)'
    args = [value1, value2, value3, value4, value5, value6, value7]
    con = None
    try:
        con = pymysql.connect(host=host,
                                user=user,
                                password=password,
                                db=db,
                                autocommit=True,
                                local_infile=1)
        print('Connected to DB: {}'.format(host))
        # Values are bound as parameters, never formatted into the SQL text.
        cursor = con.cursor()
        cursor.execute(load_sql, args)
        print('Successfully loaded the table from csv.')
    except Exception as e:
        print('Error: {}'.format(str(e)))
        sys.exit(1)
    finally:
        # Release the connection on the failure path too; skip it when the
        # driver already dropped the connection (close() would raise then).
        if con is not None and con.open:
            con.close()

# Check for duplicate row before insertion into table
def check_row(host, user, password, db, value1, value2, value3, value4, value5, value6, value7):
    """Insert the row unless a row with the same Symbol and Date exists.

    Counts the rows in ``asx`` whose Symbol equals ``value1`` and whose
    Date equals ``value2`` (reformatted as YYYY-MM-DD). When the count is
    zero the row is handed to ``csv_to_mysql``; otherwise the function
    returns without inserting, so the caller can carry on with the next
    line of the file.

    Args:
        host, user, password, db: Connection settings passed to
            ``pymysql.connect``.
        value1: Symbol.
        value2: Date as an eight-character YYYYMMDD string, e.g. 20190111.
        value3 .. value7: Remaining column values (Open, High, Low, Close,
            Volume), passed through unchanged to ``csv_to_mysql``.

    Raises:
        SystemExit: With status 1 after printing the error, if the lookup
            (or the subsequent insert) fails.
    """
    # Turn the date string 20190111 into 2019-01-11 for the lookup.
    str_value2 = value2
    year = str_value2[:4]
    month = str_value2[4:6]
    day = str_value2[-2:]
    str_query_value2 = year + '-' + month + '-' + day
    print(str_query_value2)
    # Select statement to query whether the row already exists.
    query = 'SELECT COUNT(*) from asx WHERE Symbol = %s AND Date = %s'
    args = [value1, str_query_value2]
    con = None
    try:
        con = pymysql.connect(host=host,
                                user=user,
                                password=password,
                                db=db,
                                autocommit=True,
                                local_infile=1)
        print('Connected to DB: {}'.format(host))
        cursor = con.cursor()
        cursor.execute(query, args)
        print('Successfully queried the asx table.')
        result = cursor.fetchall()
        print(result)
        # fetchall() returns a sequence of row tuples; COUNT(*) is the
        # first column of the first (and only) row.
        int_result = result[0][0]
        print(int_result)
    except Exception as e:
        print('Error: {}'.format(str(e)))
        sys.exit(1)
    finally:
        # Release the connection on the failure path too; skip it when the
        # driver already dropped the connection (close() would raise then).
        if con is not None and con.open:
            con.close()

    if int_result >= 1:
        # Duplicate: skip just this row. Calling exit() here would raise
        # SystemExit and stop the whole import at the first duplicate.
        return
    # NOTE: this check-then-insert is not atomic; a UNIQUE or PRIMARY KEY
    # constraint on (Symbol, Date) would enforce uniqueness in the database.
    csv_to_mysql(host, user, password, db, value1, value2, value3, value4, value5, value6, value7)

# Execute the script
fun_read_file(ticker_file)

如何在插入MySQL数据库之前以编程方式检查重复的行

3 个答案: