我正在使用 CMSSW_11_1_4 环境(Python 3.8.4,uproot 3.11.3,awkward 0.12.20),并尝试打开一些非常大的 .root 文件(3 个文件,每个 9 GB),然后用事件循环遍历其中的某些分支。遍历第一个文件(前 10 万个事件)时,代码似乎按预期工作:处理第一个事件需要花费大量时间,但随后的事件处理得很快。然而,当处理到第 10 万个事件、进入第二个文件时,代码崩溃并打印出 AssertionError 消息。
下面是一段通过 xrootd 访问文件、可以重现该问题的代码。
import uproot
import uproot_methods
import numpy as np
import matplotlib.pyplot as plt
# Remote location of the ntuples (root:// URL) and the three QCD files to read.
base = 'root://cmseos.fnal.gov//store/user/kdipetri/SUEP/Production_v0.2/2018/NTUP/'
datasets = [
    base + filename
    for filename in (
        'Autumn18.QCD_HT1000to1500_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
        'Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
        'Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
    )
]

# Open only the branches used below as lazy arrays, sharing one "1 GB" array cache.
mycache = uproot.ArrayCache("1 GB")
events = uproot.lazyarrays(
    datasets,
    'TreeMaker2/PreSelection',
    [
        'HT',
        'CrossSection',
        'Tracks.fCoordinates.fX',
        'Tracks.fCoordinates.fY',
        'Tracks.fCoordinates.fZ',
        'Tracks_fromPV0',
        'Tracks_matchedToPFCandidate',
    ],
    cache=mycache,
)
# Per-event count of tracks passing the selection below. Events that fail
# the HT cut are skipped and keep their initial multiplicity of 0.
# The number of events does not change during the loop, so look it up once
# instead of on every iteration / progress print.
n_events = events['Tracks.fCoordinates.fX'].size
trk_multiplicity = np.zeros(n_events)
for ievt in range(n_events):
    if ievt % 1000 == 0:
        print("Processing event %d. Progress: %.2f%%" % (ievt, 100 * ievt / n_events))
    if events['HT'][ievt] < 1200:
        continue
    # Per-track x/y/z components for this event, used as the Cartesian
    # components of the four-vectors built below.
    tracks_x = events['Tracks.fCoordinates.fX'][ievt]
    tracks_y = events['Tracks.fCoordinates.fY'][ievt]
    tracks_z = events['Tracks.fCoordinates.fZ'][ievt]
    # Energy from the three components plus a fixed mass term of 0.13957
    # (presumably the charged-pion mass in GeV -- confirm).
    tracks_E = np.sqrt(tracks_x**2 + tracks_y**2 + tracks_z**2 + 0.13957**2)
    tracks = uproot_methods.TLorentzVectorArray.from_cartesian(
        tracks_x,
        tracks_y,
        tracks_z,
        tracks_E,
    )
    tracks_fromPV0 = events['Tracks_fromPV0'][ievt]
    tracks_matchedToPFCandidate = events['Tracks_matchedToPFCandidate'][ievt]
    # NOTE(review): the eta cut is one-sided (eta < 2.5); confirm whether
    # abs(eta) < 2.5 was intended.
    tracks = tracks[
        (tracks.pt > 1.)
        & (tracks.eta < 2.5)
        & (tracks_fromPV0 >= 2)
        & (tracks_matchedToPFCandidate > 0)
    ]
    trk_multiplicity[ievt] = tracks.size
# Plot results
# Normalised (density=True) histogram of the track multiplicity for events
# with HT > 1200, weighted by the per-event CrossSection branch.
passes_ht = events['HT'] > 1200
CrossSection = events['CrossSection'][passes_ht]
trk_multiplicity = trk_multiplicity[passes_ht]
fig = plt.figure(figsize=(8, 8))
ax = fig.gca()
ax.hist(
    trk_multiplicity,
    bins=100,
    density=True,
    weights=CrossSection,
    histtype='step',
)
plt.show()
我确信这是一些内存问题,因为我已经设法以这种方式处理了多个文件(尽管文件较小并且本地存储)。我该如何克服呢?我需要以其他方式处理缓存吗?
编辑(2020年10月28日):我尝试切换到 uproot4(0.0.27)/ awkward1(0.3.1),但没有成功。我把同样的代码改写成了下面这样:
import uproot4 as uproot
import uproot_methods
import awkward1 as ak
import numpy as np
import matplotlib.pyplot as plt
# Remote location of the ntuples (root:// URL); every file maps to the same
# tree path inside it.
base = 'root://cmseos.fnal.gov//store/user/kdipetri/SUEP/Production_v0.2/2018/NTUP/'
datasets = {
    base + filename: 'TreeMaker2/PreSelection'
    for filename in (
        'Autumn18.QCD_HT1000to1500_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
        'Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
        'Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
    )
}

# Open all files as a single lazy array.
events = uproot.lazy(datasets)
# Per-event count of tracks passing the selection below. Events that fail
# the HT cut are skipped and keep their initial multiplicity of 0.
# The number of events does not change during the loop, so compute it once
# instead of on every iteration / progress print.
n_events = len(events['Tracks.fCoordinates.fX'])
trk_multiplicity = np.zeros(n_events)
for ievt in range(n_events):
    if ievt % 1000 == 0:
        print("Processing event %d. Progress: %.2f%%" % (ievt, 100 * ievt / n_events))
    if events['HT'][ievt] < 1200:
        continue
    # Per-track x/y/z components for this event, used as the Cartesian
    # components of the four-vectors built below.
    tracks_x = events['Tracks.fCoordinates.fX'][ievt]
    tracks_y = events['Tracks.fCoordinates.fY'][ievt]
    tracks_z = events['Tracks.fCoordinates.fZ'][ievt]
    # Energy from the three components plus a fixed mass term of 0.13957
    # (presumably the charged-pion mass in GeV -- confirm).
    tracks_E = np.sqrt(tracks_x**2 + tracks_y**2 + tracks_z**2 + 0.13957**2)
    # The awkward1 arrays are converted with ak.to_awkward0 before being
    # handed to uproot_methods.
    tracks = uproot_methods.TLorentzVectorArray.from_cartesian(
        ak.to_awkward0(tracks_x),
        ak.to_awkward0(tracks_y),
        ak.to_awkward0(tracks_z),
        ak.to_awkward0(tracks_E),
    )
    tracks_fromPV0 = events['Tracks_fromPV0'][ievt]
    tracks_matchedToPFCandidate = events['Tracks_matchedToPFCandidate'][ievt]
    # NOTE(review): the eta cut is one-sided (eta < 2.5); confirm whether
    # abs(eta) < 2.5 was intended.
    tracks = tracks[
        (tracks.pt > 1.)
        & (tracks.eta < 2.5)
        & (ak.to_awkward0(tracks_fromPV0) >= 2)
        & (ak.to_awkward0(tracks_matchedToPFCandidate) > 0)
    ]
    trk_multiplicity[ievt] = tracks.size
# Plot results
# Normalised (density=True) histogram of the track multiplicity for events
# with HT > 1200, weighted by the per-event CrossSection branch.
passes_ht = events['HT'] > 1200
CrossSection = events['CrossSection'][passes_ht]
trk_multiplicity = trk_multiplicity[passes_ht]
fig = plt.figure(figsize=(8, 8))
ax = fig.gca()
ax.hist(
    trk_multiplicity,
    bins=100,
    density=True,
    weights=CrossSection,
    histtype='step',
)
plt.show()
这一次,代码虽然慢得多(可能是因为缓存较小?),但立即就开始处理事件。同样,当处理到第二个文件时,代码再次崩溃,这次显示的是以下消息:
Traceback (most recent call last):
File "scripts/plotEventShapes_lazy.py", line 43, in <module>
tracks_x = events['Tracks.fCoordinates.fX'][ievt]
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/awkward1/highlevel.py", line 974, in __getitem__
self._layout[where], self._behavior, cache=self._cache
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/awkward1/partition.py", line 366, in __getitem__
return PartitionedArray.from_ext(self._ext.getitem_at(where))
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/behaviors/TBranch.py", line 2017, in array
_ranges_or_baskets_to_arrays(
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/behaviors/TBranch.py", line 3264, in _ranges_or_baskets_to_arrays
uproot4.source.futures.delayed_raise(*obj)
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/source/futures.py", line 46, in delayed_raise
raise exception_value.with_traceback(traceback)
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/behaviors/TBranch.py", line 3189, in chunk_to_basket
basket = uproot4.models.TBasket.Model_TBasket.read(
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/model.py", line 730, in read
self.read_members(chunk, cursor, context, file)
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/models/TBasket.py", line 230, in read_members
) = cursor.fields(chunk, _tbasket_format2, context)
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/source/cursor.py", line 195, in fields
return format.unpack(chunk.get(start, stop, self, context))
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/source/chunk.py", line 370, in get
raise uproot4.deserialization.DeserializationError(
uproot4.deserialization.DeserializationError: while reading
TBasket version None as uproot4.models.TBasket.Model_TBasket (? bytes)
fNbytes: -1607368158
fObjlen: -1243566277
fDatime: 2634931141
fKeylen: -27664
fCycle: 21409
attempting to get bytes 483015:483033
outside expected range 510698:538758 for this Chunk
in file root://cmseos.fnal.gov//store/user/kdipetri/SUEP/Production_v0.2/2018/NTUP/Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root
最后补充一点:我用较小的文件(每个信号文件大约 1 GB)运行了相同的代码,这次同样通过 xrootd 访问,循环没有出现任何问题。既然这些文件非常大,这会不会是缓存问题?