Looking for a better strategy for bulk upserts in SQLAlchemy

Time: 2014-05-26 21:03:39

Tags: python flask sqlalchemy

I have a Flask application with a RESTful API. One of the API calls is a "mass upsert" call with a JSON payload, and I am struggling with its performance.
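For context, each element of the payload is a dict of task attributes that gets unpacked into task_from_json() in the code below. A single item might look roughly like this (the field names here are purely hypothetical, since the real payload isn't shown):

# Hypothetical shape of one item in the PUT body; the real fields
# depend on the Task model and task_from_json().
taskdata = [
    {"identifier": "task-1", "instruction": "Survey this area"},
    {"identifier": "task-2", "instruction": "Check this node"},
]
# Each dict is unpacked as task_from_json(slug, **task) to build a Task.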

The first thing I tried was using merge_result on a Query object, because...

This is an optimized method which will merge all mapped instances, preserving the structure of the result rows and unmapped columns, with less method overhead than calling Session.merge() explicitly for each value.

Here is the initial code:

class AdminApiUpdateTasks(Resource):

    """Bulk task creation / update endpoint"""

    def put(self, slug):
        taskdata = json.loads(request.data)
        existing = db.session.query(Task).filter_by(challenge_slug=slug)
        existing.merge_result(
            [task_from_json(slug, **task) for task in taskdata])
        db.session.commit()
        return {}, 200

A request to this endpoint, containing ~5000 records that all already exist in the database, took more than 11 minutes to return:

real    11m36.459s
user    0m3.660s
sys 0m0.391s

As this will be a fairly typical use case, I started researching alternatives to improve performance. Against my better judgement, I tried merging each individual record into the session instead:

class AdminApiUpdateTasks(Resource):

    """Bulk task creation / update endpoint"""

    def put(self, slug):
        # Get the posted data
        taskdata = json.loads(request.data)
        for task in taskdata:
            db.session.merge(task_from_json(slug, **task))
        db.session.commit()
        return {}, 200

Much to my surprise, this turned out to be more than twice as fast:

real    4m33.945s
user    0m3.608s
sys 0m0.258s

I have two questions:

  1. Why is the second strategy using merge faster than the supposedly optimized first strategy that uses merge_result?
  2. What other strategies, if any, should I pursue to optimize this further?

2 Answers:

Answer 0: (score: 0)

I think this is what was causing your slowness in the first query:

existing = db.session.query(Task).filter_by(challenge_slug=slug)

You should also change this:

    existing.merge_result(
        [task_from_json(slug, **task) for task in taskdata])

to:

    existing.merge_result(
        (task_from_json(slug, **task) for task in taskdata))

That should save some memory and time, because the full list is never built in memory before being handed to the merge_result method; a generator expression lets merge_result consume the items one at a time.

Answer 1: (score: 0)

This is an old question, but I hope this answer can still help people.

I used the same idea as this example set up by SQLAlchemy, but I added benchmarking for UPSERT operations (insert a record if it does not exist, otherwise update the existing one). Here are the results on a PostgreSQL 11 database:

Tests to run: test_customer_individual_orm_select, test_customer_batched_orm_select, test_customer_batched_orm_select_add_all, test_customer_batched_orm_merge_result
test_customer_individual_orm_select : UPSERT statements via individual checks on whether objects exist and add new objects individually (10000 iterations); total time 9.359603 sec
test_customer_batched_orm_select : UPSERT statements via batched checks on whether objects exist and add new objects individually (10000 iterations); total time 1.553555 sec
test_customer_batched_orm_select_add_all : UPSERT statements via batched checks on whether objects exist and add new objects in bulk (10000 iterations); total time 1.358680 sec
test_customer_batched_orm_merge_result : UPSERT statements using batched merge_results (10000 iterations); total time 7.191284 sec

As you can see, merge_result is far from the most efficient option. I would suggest checking in batches whether the records exist, then updating or inserting them accordingly. Hope this helps!

"""
This series of tests illustrates different ways to UPSERT
or INSERT ON CONFLICT UPDATE a large number of rows in bulk.
"""
from sqlalchemy import Column
from sqlalchemy import create_engine
from sqlalchemy import Integer
from sqlalchemy import String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session
from profiler import Profiler


Base = declarative_base()
engine = None


class Customer(Base):
  __tablename__ = "customer"
  id = Column(Integer, primary_key=True)
  name = Column(String(255))
  description = Column(String(255))


Profiler.init("bulk_upserts", num=100000)


@Profiler.setup
def setup_database(dburl, echo, num):
  global engine
  engine = create_engine(dburl, echo=echo)
  Base.metadata.drop_all(engine)
  Base.metadata.create_all(engine)

  s = Session(engine)
  for chunk in range(0, num, 10000):
    # Insert half of the customers we want to merge
    s.bulk_insert_mappings(
      Customer,
      [
        {
          "id": i,
          "name": "customer name %d" % i,
          "description": "customer description %d" % i,
        }
        for i in range(chunk, chunk + 10000, 2)
      ],
    )
  s.commit()


@Profiler.profile
def test_customer_individual_orm_select(n):
  """
  UPSERT statements via individual checks on whether objects exist
  and add new objects individually
  """
  session = Session(bind=engine)
  for i in range(0, n):
    customer = session.query(Customer).get(i)
    if customer:
      customer.description += "updated"
    else:
      session.add(Customer(
          id=i,
          name=f"customer name {i}",
          description=f"customer description {i} new"
      ))
    session.flush()
  session.commit()

@Profiler.profile
def test_customer_batched_orm_select(n):
  """
  UPSERT statements via batched checks on whether objects exist
  and add new objects individually
  """
  session = Session(bind=engine)
  for chunk in range(0, n, 1000):
    customers = {
        c.id: c for c in
        session.query(Customer)\
            .filter(Customer.id.between(chunk, chunk + 1000))
    }
    for i in range(chunk, chunk + 1000):
      if i in customers:
        customers[i].description += "updated"
      else:
        session.add(Customer(
            id=i,
            name=f"customer name {i}",
            description=f"customer description {i} new"
        ))
    session.flush()
  session.commit()

@Profiler.profile
def test_customer_batched_orm_select_add_all(n):
  """
  UPSERT statements via batched checks on whether objects exist
  and add new objects in bulk
  """
  session = Session(bind=engine)
  for chunk in range(0, n, 1000):
    customers = {
        c.id: c for c in
        session.query(Customer)\
            .filter(Customer.id.between(chunk, chunk + 1000))
    }
    to_add = []
    for i in range(chunk, chunk + 1000):
      if i in customers:
        customers[i].description += "updated"
      else:
        to_add.append({
            "id": i,
            "name": "customer name %d" % i,
            "description": "customer description %d new" % i,
        })
    if to_add:
      session.bulk_insert_mappings(
        Customer,
        to_add
      )
      to_add = []
    session.flush()
  session.commit()

@Profiler.profile
def test_customer_batched_orm_merge_result(n):
  "UPSERT statements using batched merge_results"
  session = Session(bind=engine)
  for chunk in range(0, n, 1000):
    customers = session.query(Customer)\
        .filter(Customer.id.between(chunk, chunk + 1000))
    customers.merge_result(
      Customer(
          id=i,
          name=f"customer name {i}",
          description=f"customer description {i} new"
      ) for i in range(chunk, chunk + 1000)
    )
    session.flush()
  session.commit()
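
One alternative not benchmarked above: since these tests run against PostgreSQL, the dialect's native INSERT ... ON CONFLICT DO UPDATE support is also worth trying, as it pushes the whole upsert into a single statement per batch instead of selecting first. A minimal sketch, reusing the Customer model from the benchmark (the batch size and updated columns are illustrative choices, and timings would need to be measured in the same harness):

from sqlalchemy.dialects.postgresql import insert as pg_insert


def upsert_customers_on_conflict(session, n, batch_size=1000):
  """Upsert rows with one INSERT ... ON CONFLICT DO UPDATE per batch."""
  for chunk in range(0, n, batch_size):
    rows = [
      {
        "id": i,
        "name": "customer name %d" % i,
        "description": "customer description %d new" % i,
      }
      for i in range(chunk, chunk + batch_size)
    ]
    stmt = pg_insert(Customer.__table__).values(rows)
    # On a primary-key conflict, update the existing row in place
    # using the values that would have been inserted (EXCLUDED).
    stmt = stmt.on_conflict_do_update(
      index_elements=["id"],
      set_={
        "name": stmt.excluded.name,
        "description": stmt.excluded.description,
      },
    )
    session.execute(stmt)
  session.commit()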