我有一个如下所示的数据集:
class Release(db.Model):
    """A named release, stored in the ``releases`` table.

    NOTE(review): ``builds`` is ordered by ``ReleaseBuilds.date_created``, but
    no ``date_created`` column is declared on the ``ReleaseBuilds`` class
    visible in this file -- confirm that the column exists.
    """

    __tablename__ = 'releases'

    id = db.Column(db.Integer, primary_key=True)
    # Foreign key to platforms.id (the platforms model is not shown here).
    platform_id = db.Column(db.Integer, db.ForeignKey('platforms.id'))
    # Up to 20 characters; declared unique.
    name = db.Column(db.String(20), unique=True)
    # Builds of this release, declared with lazy='dynamic' and ordered by
    # ReleaseBuilds.date_created descending.
    builds = db.relationship(
        'ReleaseBuilds',
        cascade='all,delete',
        lazy='dynamic',
        order_by="desc(ReleaseBuilds.date_created)",
    )
class ReleaseBuilds(db.Model):
    """A single build of a release, stored in the ``release_builds`` table."""

    __tablename__ = 'release_builds'

    id = db.Column(db.Integer, primary_key=True)
    # Foreign key to the owning row in releases.
    release_id = db.Column(db.Integer, db.ForeignKey('releases.id'))
    # Required; up to 150 characters.
    name = db.Column(db.String(150), nullable=False)
    # backref='builds' also exposes a `builds` attribute on
    # ReleaseBuildArtifacts pointing back at this build.
    artifacts = db.relationship(
        'ReleaseBuildArtifacts',
        cascade='all,delete',
        backref='builds',
        lazy='dynamic',
    )
    deployments = db.relationship(
        'Deployments',
        cascade='all,delete',
        lazy='dynamic',
    )
    # Only the 'delete' cascade is configured for tests, unlike the
    # 'all,delete' used by the two relationships above.
    tests = db.relationship('Test', cascade='delete', lazy='dynamic')
class ReleaseBuildArtifacts(db.Model):
    """An artifact row in ``release_build_artifacts``.

    Each row carries a foreign key to a release build and a foreign key to an
    application, plus an ``rpm`` string.
    """

    __tablename__ = 'release_build_artifacts'

    id = db.Column(db.Integer, primary_key=True)
    release_build_id = db.Column(
        db.Integer, db.ForeignKey('release_builds.id')
    )
    # Foreign key to applications.id (the Application model is not shown here).
    application_id = db.Column(db.Integer, db.ForeignKey('applications.id'))
    # Up to 300 characters; presumably the RPM file name or path -- TODO confirm.
    rpm = db.Column(db.String(300))
    # The ReleaseBuilds row referenced by release_build_id.
    build = db.relationship('ReleaseBuilds')
    application = db.relationship('Application')
class Deployments(db.Model):
    """Deployment of a release build to an environment (``deployments`` table).

    The primary key is the composite (release_build_id, environment_id).
    """

    __tablename__ = 'deployments'

    release_build_id = db.Column(
        db.Integer, db.ForeignKey('release_builds.id'), primary_key=True
    )
    environment_id = db.Column(
        db.Integer, db.ForeignKey('environments.id'), primary_key=True
    )
    # Timezone-naive timestamp; defaults to datetime.datetime.utcnow, passed
    # as a callable rather than called here.
    date_deployed = db.Column(
        db.DateTime(timezone=False), default=datetime.datetime.utcnow
    )
    environment = db.relationship('Environment', foreign_keys=[environment_id])
class TestType(db.Model):
    """Lookup table of test types (``test_types``)."""

    __tablename__ = 'test_types'

    id = db.Column(db.Integer, primary_key=True)
    # Up to 50 characters; declared unique.
    name = db.Column(db.String(50), unique=True)
class Test(db.Model):
    """A test tied to a release build and an environment (``tests`` table).

    Besides the single-column foreign keys, the (release_build_id,
    environment_id) pair is constrained as a composite foreign key to the
    ``deployments`` table via ``__table_args__``.
    """

    __tablename__ = 'tests'

    id = db.Column(db.Integer, primary_key=True)
    release_build_id = db.Column(
        db.Integer, db.ForeignKey('release_builds.id'), nullable=False
    )
    environment_id = db.Column(
        db.Integer, db.ForeignKey('environments.id'), nullable=False
    )
    # Optional reference to test_types.id.
    test_type_id = db.Column(db.Integer, db.ForeignKey('test_types.id'))
    name = db.Column(db.String(300))
    # Resolved through environment_id only (foreign_keys is given explicitly).
    environments = db.relationship(
        'Environment', foreign_keys=[environment_id]
    )
    results = db.relationship(
        'TestResult', cascade='all,delete', lazy='dynamic'
    )

    __table_args__ = (
        ForeignKeyConstraint(
            ['release_build_id', 'environment_id'],
            ['deployments.release_build_id', 'deployments.environment_id'],
        ),
    )
class TestResult(db.Model):
    """A named pass/fail result belonging to a test (``test_results`` table)."""

    __tablename__ = 'test_results'

    id = db.Column(db.Integer, primary_key=True)
    # Required reference to the owning row in tests.
    test_id = db.Column(db.Integer, db.ForeignKey('tests.id'), nullable=False)
    name = db.Column(db.String(500))
    # Nullable boolean: no nullable=False is set, so NULL is allowed.
    passed = db.Column(db.Boolean)
数据中可能存在重复的日期,日期之间也可能有间隔(即缺少某些日期)。
对于每一天,我想计算过去 30 天内出现的行数(即案例数)。
我成功地通过以下方式做到了这一点:
date, brandname, status, case number
2017-01-01, x1, closed, 12345
2017-01-01, x2, closed, 12345
2017-01-01, x3, closed, 12345
2017-01-02, x4, open, 7864
2017-01-03, x5, open, 78642
...
这给了我所需的确切结果:
# Sample data: one row per case, with the date on which it occurred.
# check.names = FALSE keeps the non-syntactic column name `case number`.
dataframe <- data.frame(
  date = as.Date(c(
    "2017-01-01", "2017-01-01", "2017-01-01", "2017-01-02", "2017-01-03"
  )),
  brandname = c("x1", "x2", "x3", "x4", "x5"),
  status = c("closed", "closed", "closed", "open", "open"),
  `case number` = c(12345L, 12345L, 12345L, 7864L, 78642L),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# For every row, count the rows whose date lies strictly between
# (date - 30 days) and date; both endpoints are excluded.
#
# vapply() over seq_along() replaces `for (i in 1:nrow(dataframe))`:
# - an empty data frame yields integer(0) instead of iterating over c(1, 0);
# - no intermediate subset data frame is built for each row.
# na.rm = TRUE mirrors subset(), which drops rows whose condition is NA.
case_dates <- dataframe$date
dataframe$count <- vapply(
  seq_along(case_dates),
  function(i) {
    sum(
      case_dates > case_dates[i] - 30L & case_dates < case_dates[i],
      na.rm = TRUE
    )
  },
  integer(1)
)
但我相信有更有效的方法可以做到这一点。有人可以帮忙吗?
答案 0 :(得分:0)
这可以使用 non-equi join 来解决:
library(data.table)

# Work on a data.table version of the input data frame.
DT <- as.data.table(dataframe)

# Helper table: each row's date plus the start of its window (date - 30).
date_windows <- DT[, .(date, dm30 = date - 30)]

# Non-equi join of DT against the helper table. For each helper row, .N is the
# number of DT rows with dm30 <= date < helper date; the lower bound is
# inclusive and the upper bound exclusive. by = .EACHI yields one count per
# helper row.
window_counts <- DT[
  date_windows,
  on = .(date >= dm30, date < date),
  .N,
  by = .EACHI
]

# Add the counts to DT by reference; the trailing [] prints the result.
DT[, count := window_counts$N][]
         date brandname status case number count
1: 2017-01-01        x1 closed       12345     0
2: 2017-01-01        x2 closed       12345     0
3: 2017-01-01        x3 closed       12345     0
4: 2017-01-02        x4   open        7864     3
5: 2017-01-03        x5   open       78642     4
解决方案主要包括三个步骤:
1. `DT[, .(date, dm30 = date - 30)]` 创建一个辅助表,其中包含要聚合的日期范围。
2. `DT` 与该辅助表进行右连接,同时按连接条件进行聚合。`.N` 是一个特殊符号,包含每个组中的行数。
3. 通过 `:=` 将得到的计数 `N` 作为 `count` 列添加到 `DT` 中。

答案 1 :(得分:0)
这是一个解决方案:它与问题略有不同。此版本使用数据框中的日期来选择另一个数据框中的子集。
对于日期序列中的每个日期,它统计另一个数据框中落在该日期及其之前共 30 天窗口内的行数。然后,计数结果存储在连续日期列表旁边的新列中。
**数据:**
-Dataframe-
date
1 2016-07-12
2 2016-08-03
3 2016-08-24
4 2016-09-27
...
265 2017-09-27
266 2017-09-28
267 2017-09-28
268 2017-09-28
**代码:**
# Calendar of sequential dates to report on. `dates` must already exist;
# as.data.frame() names the resulting column `dates`.
churn_frame <- as.data.frame(dates)

# For each calendar date x, count the rows of `dataframe` dated from x - 29
# through x (both inclusive), i.e. a window of exactly 30 days.
# vapply() with integer(1) replaces sapply() so the result is always an
# integer vector, including integer(0) when the calendar is empty.
churn_frame$new <- vapply(
  churn_frame$dates,
  function(x) {
    sum(dataframe$date <= x & dataframe$date >= x - 29)
  },
  integer(1)
)
**结果:**
date new
1 2017-07-31 10
2 2017-08-01 10
3 2017-08-02 11
4 2017-08-03 10
5 2017-08-04 12
...
58 2017-09-26 11
59 2017-09-27 12
60 2017-09-28 14
61 2017-09-29 12