I want to scan a large table with more than 1,000,000 rows and update one column on every row.
The following code runs out of memory:
def main():
    session = Session()
    i = 0
    for row in session.query(article).yield_per(100):
        i = i + 1
        print(row.id)
        row.keywords = clean_tag(row.keywords)
        if i % 100 == 0:
            session.flush()
            session.expunge_all()
    session.commit()
As I understand the convention, flush() persists the changed objects to the database, and expunge_all() should then remove those objects from the session.
What is going wrong here? Thanks.
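One thing I tried while debugging is printing the size of the session's identity map around the flush/expunge step, to confirm that expunge_all() really empties the session between batches. The helper below is only an illustrative sketch (flush_and_report and batch_no are my own names, not part of the real script):

def flush_and_report(session, batch_no):
    # Hypothetical debugging helper: report how many instances the
    # session is tracking before and after the batch cleanup. If
    # expunge_all() works as I expect, the count should drop to 0.
    before = len(session.identity_map)
    session.flush()          # persist pending changes for this batch
    session.expunge_all()    # detach all loaded objects from the session
    after = len(session.identity_map)
    print('batch %d: tracked objects %d -> %d' % (batch_no, before, after))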
Edited on 7/17:
Following univerio's suggestion, I'm pasting the full example here:
#!/usr/bin/env python
# coding=utf-8
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, BigInteger, String
from sqlalchemy.orm import sessionmaker

# engine = create_engine('mysql://root:123456@192.168.0.202/toutiao')
engine = create_engine('mysql://root:MyNewPass4!@192.168.3.220/toutiao')
Session = sessionmaker(bind=engine, autoflush=True)
Base = declarative_base()


class article(Base):
    # __tablename__ = 'ss_article_group'
    __tablename__ = 'article100'
    id = Column(BigInteger, primary_key=True)
    keywords = Column(String)
def clean_tag(tag):
    r"""
    >>> clean_tag('a,b\nc d')
    'a,b,c,d'
    >>> clean_tag('\na,b\n\n')
    'a,b'
    >>> clean_tag('a,b,')
    'a,b'
    >>> clean_tag(',')
    """
    if tag is None:
        return False
    tags = tag.split()
    new_tag = ','.join(tags)
    new_tag = new_tag.strip(',')
    if new_tag == '':
        return None
    if new_tag == tag:
        return False
    return new_tag
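# The doctests above describe the behaviour I expect from clean_tag();
# they can be checked with the standard library's doctest runner, e.g.
# `python -m doctest this_script.py -v` (the file name here is illustrative).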
def main():
    session = Session()
    i = 0
    for row in session.query(article).yield_per(100):
        i = i + 1
        print(row.id)
        new_keywords = clean_tag(row.keywords)
        if new_keywords != False:
            row.keywords = new_keywords
        if i % 100 == 0:
            session.flush()
            session.expunge_all()
    session.commit()


if __name__ == '__main__':
    main()
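For comparison, this is a rough sketch of a workaround I'm considering: walking the table in primary-key windows with short queries and clearing the session between windows, instead of holding one long-lived yield_per cursor. The window size and the function name main_windowed are my own assumptions, not part of the script above:

def main_windowed(window=1000):
    # Sketch only: process rows in ranges of ascending primary key so
    # each iteration loads at most `window` rows into the session.
    session = Session()
    last_id = 0
    while True:
        rows = (session.query(article)
                .filter(article.id > last_id)
                .order_by(article.id)
                .limit(window)
                .all())
        if not rows:
            break
        for row in rows:
            new_keywords = clean_tag(row.keywords)
            if new_keywords is not False:
                row.keywords = new_keywords
        last_id = rows[-1].id
        session.commit()        # write this window's changes
        session.expunge_all()   # detach loaded objects so memory stays flat

I don't know yet whether this would actually keep memory bounded in my case, which is why I'm asking what is wrong with the original yield_per version.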