我正在尝试使用 elasticsearch 的 Python 包,通过批量(bulk)API 索引文档。数据来自一个 MySQL 数据库,其中大约有 10000 条记录。但是,我的 Python 批量 API 脚本只能上传约 5000 条记录,然后就在中途某处中断了。
我收到如下错误:UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)
def _fetch_queue_rows(cursor, catid):
    """Return up to LIMIT_PER_THREAD_ON_NEW rows of TABLENAME for *catid*.

    Only rows with a non-zero ``product_id`` are selected.
    """
    sql = "SELECT * FROM %s WHERE catid=%d AND product_id<>0 LIMIT %d" % (
        TABLENAME, catid, LIMIT_PER_THREAD_ON_NEW)
    cursor.execute(sql)
    return cursor.fetchall()


def _fetch_products(cursor, catid, product_ids):
    """Return the tbl_products rows of *catid* whose product_id is in *product_ids*.

    An empty *product_ids* yields an empty list without querying, because
    ``IN ()`` is not valid SQL.
    """
    if not product_ids:
        return []
    sql = "SELECT * FROM tbl_products WHERE catid=%d AND product_id IN (%s)" % (
        catid, ','.join(map(str, product_ids)))
    cursor.execute(sql)
    return cursor.fetchall()


def new_products(catid):
    """Index the pending products of category *catid* into Elasticsearch.

    Reads a batch of product ids from TABLENAME, loads the matching rows from
    ``tbl_products``, attaches specs and images to each row and sends the rows
    to Elasticsearch through the bulk API. The batch query is then repeated;
    the loop ends when a batch is empty or contains the same product ids as
    the previous one.

    Args:
        catid: Category id; converted with ``int()`` before use.

    Raises:
        ValueError: If *catid* cannot be converted to an integer.
    """
    connection = get_connection()
    try:
        es = get_elastic_connection()
        cursor = connection.cursor()
        catid = int(catid)

        # Rows are accessed by column name, so the cursor is assumed to return
        # dict-like, mutable rows (e.g. a DictCursor) -- TODO confirm.
        # List comprehensions (not map) keep the emptiness tests below
        # meaningful on Python 3, where map() returns an always-truthy iterator.
        product_ids_only = [
            row['product_id'] for row in _fetch_queue_rows(cursor, catid)]
        products_list = _fetch_products(cursor, catid, product_ids_only)

        while products_list:
            # Single pre-formatted string: prints "catid count" identically on
            # Python 2 and Python 3.
            print("%s %s" % (catid, len(products_list)))

            product_ids_from_db = [row['pid'] for row in products_list]
            product_images = get_images(product_ids_from_db)
            product_specs = get_specs(catid, product_ids_from_db)

            # NOTE(review): rows are handed to Elasticsearch exactly as the DB
            # driver returned them. If the driver yields byte strings holding
            # non-ASCII data, serialisation can fail on Python 2; presumably
            # get_connection() should request unicode results (charset /
            # use_unicode) -- verify.
            bulk_data = []
            for row in products_list:
                pid = row['pid']

                row['p_spec'] = {'d_spec': [], 'f_spec': []}
                if pid in product_specs:
                    for spec_key in ('d_spec', 'f_spec'):
                        if spec_key in product_specs[pid]:
                            row['p_spec'][spec_key] = product_specs[pid][spec_key]

                # NOTE(review): the original indentation was lost. This image
                # check is reconstructed as a sibling of the spec check, and
                # 'no_img' is set only when an image exists -- confirm both
                # against the intended behaviour.
                if pid in product_images and product_images[pid]:
                    row['pimg'] = product_images[pid]
                    row['no_img'] = '1'

                # The bulk body alternates an action line and a document line.
                bulk_data.append({
                    "index": {
                        '_index': ES_INDEX,
                        '_type': ES_TYPE,
                        '_id': pid,
                    }
                })
                bulk_data.append(row)

                # bulk_data grows two entries per document, so "==" would never
                # match an odd limit; ">=" flushes in that case as well.
                if len(bulk_data) >= ES_LIMIT_PER_REQUEST:
                    es.bulk(index=ES_INDEX, body=bulk_data, refresh=True)
                    bulk_data = []

            # Send whatever is left over from the last partial chunk.
            if bulk_data:
                es.bulk(index=ES_INDEX, body=bulk_data, refresh=True)

            # Ask for the next batch. Getting the same ids again means nothing
            # changed in TABLENAME, so stop instead of looping forever.
            new_product_ids_only = [
                row['product_id'] for row in _fetch_queue_rows(cursor, catid)]
            if set(product_ids_only) == set(new_product_ids_only):
                print("%s new products are same" % catid)
                break

            product_ids_only = new_product_ids_only
            products_list = _fetch_products(cursor, catid, new_product_ids_only)
    finally:
        # Release the connection even when a query or a bulk request raises.
        connection.close()
有人知道这里出了什么问题吗?
此致
答案 0 :(得分:0)
我找到了问题所在。
实际上,我是用多线程来索引数据的,因此脚本运行时我没有看到任何错误信息。
最后,我通过在 MySQLdb.connect 函数中传入 charset 和 use_unicode 参数解决了这个问题,例如 MySQLdb.connect(..., charset='utf8', use_unicode=True)。