
时间:2018-11-20 15:21:17

标签: python sqlite sqlalchemy



import subprocess
from subprocess import call
import time
import pandas as pd
import numpy as np
from sqlalchemy.orm import sessionmaker
from sqlalchemy import func, distinct, text
from sqlalchemy.ext.hybrid import hybrid_method
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, create_engine, and_
import os

n_users = 1000
n_days = 60
n_domains = 100
all_users = ['user%d' % i for i in range(n_users)]
all_domains = ['domain%d' % i for i in range(n_domains)]
n_rows = n_users*n_days*n_domains

Base = declarative_base()

#file_path = '/home/local/CORVIL/lpuggini/Desktop/example.db'
file_path = '/data/misc/luca/example.db'
db_path = 'sqlite:///' + file_path

engine = create_engine(db_path)

def get_session():
    """Return a new ORM session bound to the module-level ``engine``."""
    # A fresh session factory is built on every call, exactly as before.
    return sessionmaker(bind=engine)()

class DailyUserWebsite(Base):
    """ORM mapping for the ``daily_user_website`` table.

    Each row stores a user name, a domain name and an integer ``time_secs``
    value, plus an integer primary key.
    """
    __tablename__ = 'daily_user_website'

    id = Column(Integer, primary_key=True)
    # Indexed: get_df_daily_data_per_users filters on this column.
    user = Column(String(600), index=True)
    domain = Column(String(600))
    time_secs = Column(Integer, index=True)

    def __repr__(self):
        # time_secs is formatted with %s rather than %d: on an instance whose
        # time_secs has not been set the attribute is None, and %d would
        # raise TypeError. For integers the output is unchanged.
        return "DailyUserWebsite(user='%s', domain='%s', time_secs=%s)" % \
            (self.user, self.domain, self.time_secs)

def get_df_daily_data_per_users(users):
    """Load every ``daily_user_website`` row belonging to the given users.

    Args:
        users: iterable of user names matched against the ``user`` column.

    Returns:
        A pandas DataFrame with one row per matching table row.
    """
    session = get_session()
    try:
        query = session.query(DailyUserWebsite).filter(
            DailyUserWebsite.user.in_(users))
        return pd.read_sql(query.statement, session.bind)
    finally:
        # A new session is opened on every call; close it even when the read
        # fails so sessions do not pile up across repeated calls.
        session.close()

def create_db():
    """Build the benchmark database at ``file_path`` from scratch.

    Fills the ``daily_user_website`` table with ``n_rows`` random rows,
    written in batches. Each row gets a random user from ``all_users``, a
    random domain from ``all_domains`` and a ``time_secs`` value rounded down
    to a whole day within the first ``n_days`` days.
    """
    secs_per_day = 3600 * 24
    batch_size = 10000

    # NOTE(review): the body of this check was missing (a syntax error).
    # Removing the stale file is assumed to be the intent, since the loop
    # below appends and would otherwise grow an existing database - confirm.
    if os.path.exists(file_path):
        os.remove(file_path)

    # Create the table from the ORM model so it has the ``id`` primary key and
    # the indexes the model declares; to_sql alone would create a bare table
    # without them.
    Base.metadata.create_all(engine)

    # Ceiling division: the previous ``int(n_rows / batch_size) + 1`` wrote a
    # whole extra batch whenever n_rows was a multiple of batch_size.
    n_iter = (n_rows + batch_size - 1) // batch_size
    for i in range(n_iter):
        print('Building db iteration %d out of %d' % (i, n_iter))
        # The last batch may be shorter so the total is exactly n_rows.
        size = min(batch_size, n_rows - i * batch_size)
        df = pd.DataFrame()
        df['user'] = np.random.choice(all_users, size)
        df['domain'] = np.random.choice(all_domains, size)
        secs = np.random.randint(0, secs_per_day * n_days, size)
        # Round each timestamp down to the start of its day.
        df['time_secs'] = secs - secs % secs_per_day
        df.to_sql('daily_user_website', engine, if_exists='append', index=False)

# Benchmark: 20 timed reads, each for 200 user names drawn at random (with
# replacement) from all_users. After iterations 0, 5, 10 and 15 the OS page
# cache is dropped so the next read starts cold.
for i in range(20):
    users = np.random.choice(all_users, 200)
    t0 = time.time()
    df = get_df_daily_data_per_users(users)
    t1 = time.time()
    # Single-argument print() produces the same line under Python 2 and 3.
    print('it= %d time taken to read %d rows %f ' % (i, df.shape[0], t1 - t0))
    if i % 5 == 0:
        print('Clean cache')
        status = os.system("sync; echo 3 > /proc/sys/vm/drop_caches")
        if status != 0:
            # If the command fails the cache is not dropped, and the
            # following timings would silently be warm-cache numbers.
            print('Warning: dropping the page cache failed (status %d)' % status)


(samenv) probe686:/data/misc/luca # python db_test.py
it= 0 time taken to read 1089089 rows 8.058407 
Clean cache
it= 1 time taken to read 1099234 rows 104.352085 
it= 2 time taken to read 1087292 rows 8.189860 
it= 3 time taken to read 1077284 rows 8.176948 
it= 4 time taken to read 1057111 rows 7.980002 
it= 5 time taken to read 1075694 rows 8.144479 
Clean cache
it= 6 time taken to read 1117925 rows 106.357740 
it= 7 time taken to read 1124208 rows 8.523779 
it= 8 time taken to read 1083049 rows 8.368766 
it= 9 time taken to read 1112264 rows 9.233548 
it= 10 time taken to read 1098628 rows 8.316519 
Clean cache


1 个答案:

答案 0 :(得分:1)



it= 0 time taken to read 1105783 rows 11.066020
Clean cache
it= 1 time taken to read 1099260 rows 13.062347
it= 2 time taken to read 1068726 rows 10.604767
it= 3 time taken to read 1070079 rows 10.434219
it= 4 time taken to read 1100475 rows 10.958109
it= 5 time taken to read 1034391 rows 9.991699
Clean cache
it= 6 time taken to read 1099236 rows 13.377700
it= 7 time taken to read 1094796 rows 10.526707
it= 8 time taken to read 1099906 rows 10.890920
it= 9 time taken to read 1069589 rows 10.302807
it= 10 time taken to read 1088436 rows 10.584793
Clean cache
it= 11 time taken to read 1131625 rows 13.288320
it= 12 time taken to read 1086223 rows 10.701297
it= 13 time taken to read 1112169 rows 10.712140
it= 14 time taken to read 1106843 rows 10.654484
it= 15 time taken to read 1117185 rows 11.097051
Clean cache
it= 16 time taken to read 1069357 rows 12.611331
it= 17 time taken to read 1046969 rows 10.186048
it= 18 time taken to read 1094897 rows 10.683771
it= 19 time taken to read 1094560 rows 10.526286



from sqlalchemy.interfaces import PoolListener


class CustomListener(PoolListener):
    """Pool listener that raises SQLite's ``mmap_size`` on a connection.

    NOTE(review): ``PoolListener`` / ``listeners=`` look like the legacy hook
    API; newer SQLAlchemy releases use ``event.listens_for(engine, "connect")``
    instead - confirm against the installed version.
    """

    def connect(self, dbapi_con, con_record):
        # Ask SQLite to memory-map up to 5000000000 bytes of the database file.
        dbapi_con.execute('PRAGMA mmap_size=5000000000')


# Rebuild the engine so the pragma above is issued through the listener.
engine = create_engine(
    db_path,
    listeners=[CustomListener()],
)

这可以提高整体性能,但我们仍然能看到清除页面缓存带来的影响。实际上,查看 /proc/<pid>/smaps 可以发现,每次迭代之后内存映射都会被解除(我尝试过一直持有会话,但这还不够)。

it= 0 time taken to read 1063352 rows 7.473585
Clean cache
it= 1 time taken to read 1070376 rows 10.780015
it= 2 time taken to read 1068626 rows 6.982098
it= 3 time taken to read 1099894 rows 6.931929
it= 4 time taken to read 1051480 rows 6.775727
it= 5 time taken to read 1110969 rows 7.078822
Clean cache
it= 6 time taken to read 1070102 rows 8.640006
it= 7 time taken to read 1088125 rows 6.982330
it= 8 time taken to read 1076214 rows 6.853842
it= 9 time taken to read 1087579 rows 6.734123
it= 10 time taken to read 1112686 rows 7.095536
Clean cache
it= 11 time taken to read 1088162 rows 8.219157
it= 12 time taken to read 1111611 rows 7.019901
it= 13 time taken to read 1087253 rows 7.008692
it= 14 time taken to read 1100205 rows 6.849916
it= 15 time taken to read 1119392 rows 6.935807
Clean cache
it= 16 time taken to read 1098322 rows 8.323868
it= 17 time taken to read 1069470 rows 6.718016
it= 18 time taken to read 1100043 rows 6.814802
it= 19 time taken to read 1093699 rows 6.844217

继续沿用 mmap 方法:SQLite 使用的是共享映射(MAP_SHARED),因此我们可以在旁边自行映射该文件,并在进程运行期间一直保留这个映射。

# mmap was used below without being imported anywhere in the snippet.
import mmap

# Same benchmark as before, but the process keeps its own read-only mapping
# of the database file open for the whole run.
with open(file_path, "r+b") as f:
    # memory-map the file, size 0 means whole file
    map_file = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
    try:
        for i in range(20):
            users = np.random.choice(all_users, 200)
            t0 = time.time()
            df = get_df_daily_data_per_users(users)
            t1 = time.time()
            # Single-argument print() produces the same line under Python 2 and 3.
            print('it= %d time taken to read %d rows %f ' % (i, df.shape[0], t1 - t0))
            if i % 5 == 0:
                print('Clean cache')
                status = os.system("sync; echo 3 > /proc/sys/vm/drop_caches")
                if status != 0:
                    # If the command fails the cache is not dropped, and the
                    # following timings would silently be warm-cache numbers.
                    print('Warning: dropping the page cache failed (status %d)' % status)
    finally:
        # Release the mapping once the benchmark is over, even on error.
        map_file.close()

现在,清除页面缓存前后的结果更加一致。但是,由于我们没有锁定该内存映射(无法通过 Python 锁定),我认为理论上 Linux 仍然可以回收这些页面;只是因为我们仍然持有对该映射的引用,回收不会那么积极。

it= 0 time taken to read 1082794 rows 7.559200
Clean cache
it= 1 time taken to read 1080642 rows 7.199970
it= 2 time taken to read 1076128 rows 6.815502
it= 3 time taken to read 1115979 rows 6.927436
it= 4 time taken to read 1047205 rows 6.393032
it= 5 time taken to read 1130465 rows 6.959300
Clean cache
it= 6 time taken to read 1070467 rows 6.824608
it= 7 time taken to read 1090999 rows 6.793943
it= 8 time taken to read 1087654 rows 6.556801
it= 9 time taken to read 1107030 rows 6.814893
it= 10 time taken to read 1095411 rows 6.607208
Clean cache
it= 11 time taken to read 1080498 rows 6.831488
it= 12 time taken to read 1105623 rows 6.745170
it= 13 time taken to read 1063738 rows 6.454814
it= 14 time taken to read 1148108 rows 6.983086
it= 15 time taken to read 1111111 rows 6.822750
Clean cache
it= 16 time taken to read 1093235 rows 6.682100
it= 17 time taken to read 1101043 rows 6.658754
it= 18 time taken to read 1095549 rows 6.498076
it= 19 time taken to read 1075400 rows 6.555085