SQLAlchemy解析一个大文件

时间:2018-01-05 09:35:50

标签: python sqlalchemy

如何使用SQLAlchemy将大文件(50-100GB)解析到我的数据库中?假设我有两张桌子。

import collections
import re
import Bio.SeqIO
import sqlalchemy
from sqlalchemy import ForeignKey, UniqueConstraint
from sqlalchemy import Column, Float, Integer, String, Text, DateTime
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.expression import ClauseElement

Base = declarative_base()

class Protein_sequence(Base):
    __tablename__ = 'protein_sequence'
    prot_seq_id = Column(Integer, primary_key=True)
    prot_seq = Column(Text, unique=True)
    protein_annotation = relationship('Protein', back_populates='protein_sequence')

class Protein(Base):
    __tablename__ = 'protein_annotation'
    prot_id = Column(Integer, primary_key=True)
    prot_seq_id = Column(Integer, ForeignKey('protein_sequence.prot_seq_id'))
    prot_acc = Column(Text, unique=True)
    prot_name = Column(Text)
    protein_sequence = relationship('Protein_sequence', back_populates='protein_annotation')

def parse_fasta(path, prot_db='unknown', taxon_name=None, taxon_id=None):
    """Parsing a fasta file (UniProt or NCBInr)."""

    prot = collections.OrderedDict()

    for record in Bio.SeqIO.parse(path, 'fasta'):

        prot['seq'] = str(record.seq)
        desc = record.description

        gi_num = re.findall('^gi\|([0-9]+)(?:\||\s|$)', desc)
        if gi_num:
            prot['prot_gi'] = gi_num[0]
        desc = re.sub('^gi\|([0-9]+)(?:\||\s|$)', '', desc)

        prot_db = re.findall('^([^|]+)\|', desc)
        if prot_db:
            prot_db = prot_db[0]
        prot['prot_db'] = prot_db

        prot_acc = re.findall('^[^|]+\|([^ ]+)', desc)[0]
        prot['prot_acc'] = prot_acc

        prot['prot_name'] = re.findall('^[^ ]+ (.+)', desc)[0]

        yield prot

def prot_db_from_fasta():
    """Create tables in SQLite database. Input fasta file."""

    db = 'sqlite:///proteomic.db'
    engine = sqlalchemy.create_engine(db)
    Base.metadata.create_all(engine)
    Session = sqlalchemy.orm.sessionmaker(bind=engine)
    session = Session()
    conn = engine.connect()

    p = 'prot.fasta'

    for prot in parse_fasta(p):

        # prot is dictionary storing info about protein
        protein_sequence = Protein_sequence(prot_seq=prot['seq'])
        session.add(protein_sequence)

        try:
            session.commit()
        except:
            session.rollback()

        # choose only columns in table protein_annotation
        cols = [c.name for c in Protein.__table__.columns]
        annotation = {key: prot[key] for key in cols if key in prot}

        annotation['prot_seq_id'] = protein_sequence.prot_seq_id
        protein_annotation = Protein(**annotation)
        session.add(protein_annotation)

    session.commit()
    conn.close()

# run function to insert data into database
prot_db_from_fasta()

问题是我需要有关序列ID的信息(用于注释表),同时将序列插入数据库,除非它已经存在。使用SQLAlchemyCore无济于事,问题是我在每个循环使用会话提交。它很慢。如果我将此脚本用于70MB大文件,则需要17秒。如果我使用sqlite3而不是SQLAlchemy,则只需0.3秒。

我知道在一个大型交易中插入数据会更好,但如何做到这一点。我不会回复序列ID来将它们用于我的蛋白质注释。

还有fasta文件的例子。

>gi|115646|sp|P02662.2|CASA1_BOVIN Alpha-S1-casein
MKLLILTCLVAVALARPKHPIKHQGLPQEVLNENLLRFFVAPFPEVFGKEKVNELSKDIGSESTEDQAME
DIKQMEAESISSSEEIVPNSVEQKHIQKEDVPSERYLGYLEQLLRLKKYKVPQLEIVPNSAEERLHSMKE
GIHAQQKEPMIGVNQELAYFYPELFRQFYQLDAYPSGAWYYVPLGTQYTDAPSFSDIPNPIGSENSEKTT
MPLW

>gi|115654|sp|P02663.2|CASA2_BOVIN Alpha-S2-casein
MKFFIFTCLLAVALAKNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRNANEEEYSIG
SSSEESAEVATEEVKITVDDKHYQKALNEINQFYQKFPQYLQYLYQGPIVLNPWDQVKRNAVPITPTLNR
EQLSTSEENSKKTVDMESTEVFTKKTKLTEEEKNRLNFLKKISQRYQKFALPQYLKTVYQHQKAMKPWIQ
PKTKVIPYVRYL

那么将数据插入数据库的好习惯是什么。

Picture of my proteomic database.

0 个答案:

没有答案