Closed amaltaro closed 6 years ago
@vlimant Seangchan just noticed that the run/lumi information is missing from the output file. I haven't checked the FJR yet, but most likely that information is missing there too.
This is weird. This job is a merge job (684921). Here is the couch record for the job (it contains 78 input files, with run/lumi information):
{
"_id": "684921",
"_rev": "24-ac61b93c0c877f775b3a2973f136e213",
"workflow": "fabozzi_Run2017C-v1-DoubleMuonLowMass-12Sep2017_9211_170922_194400_3624",
"states": {
"0": {
"newstate": "created",
"oldstate": "new",
"location": "Agent",
"timestamp": 1507915619
},
"1": {
"oldstate": "created",
"newstate": "executing",
"location": "T2_UK_London_Brunel",
"timestamp": 1507917105
},
"2": {
"oldstate": "executing",
"newstate": "complete",
"location": "Agent",
"timestamp": 1507918382
},
"3": {
"oldstate": "jobfailed",
"newstate": "jobcooloff",
"location": "Agent",
"timestamp": 1507918528
},
"4": {
"oldstate": "jobcooloff",
"newstate": "created",
"location": "Agent",
"timestamp": 1507918658
},
"5": {
"oldstate": "created",
"newstate": "executing",
"location": "T2_UK_London_Brunel",
"timestamp": 1507919736
},
"6": {
"oldstate": "executing",
"newstate": "complete",
"location": "Agent",
"timestamp": 1507922784
},
"7": {
"oldstate": "complete",
"newstate": "jobfailed",
"location": "Agent",
"timestamp": 1507922862
},
"8": {
"oldstate": "jobfailed",
"newstate": "jobcooloff",
"location": "Agent",
"timestamp": 1507922907
},
"9": {
"oldstate": "jobcooloff",
"newstate": "created",
"location": "Agent",
"timestamp": 1507928085
},
"10": {
"oldstate": "created",
"newstate": "executing",
"location": "T2_UK_London_Brunel",
"timestamp": 1507928482
},
"11": {
"oldstate": "executing",
"newstate": "complete",
"location": "Agent",
"timestamp": 1507929436
},
"12": {
"oldstate": "jobcooloff",
"newstate": "created",
"location": "Agent",
"timestamp": 1507949842
},
"13": {
"oldstate": "created",
"newstate": "executing",
"location": "T2_UK_London_Brunel",
"timestamp": 1507951456
},
"14": {
"oldstate": "executing",
"newstate": "complete",
"location": "Agent",
"timestamp": 1507952186
},
"15": {
"oldstate": "complete",
"newstate": "jobfailed",
"location": "Agent",
"timestamp": 1507952240
},
"16": {
"oldstate": "jobfailed",
"newstate": "jobcooloff",
"location": "Agent",
"timestamp": 1507952331
},
"17": {
"oldstate": "jobcooloff",
"newstate": "created",
"location": "Agent",
"timestamp": 1507997457
},
"18": {
"oldstate": "created",
"newstate": "executing",
"location": "T2_UK_London_Brunel",
"timestamp": 1507999949
},
"19": {
"oldstate": "executing",
"newstate": "complete",
"location": "Agent",
"timestamp": 1508001068
},
"20": {
"oldstate": "complete",
"newstate": "jobfailed",
"location": "Agent",
"timestamp": 1508001301
},
"21": {
"oldstate": "jobfailed",
"newstate": "retrydone",
"location": "Agent",
"timestamp": 1508001460
},
"22": {
"oldstate": "retrydone",
"newstate": "exhausted",
"location": "Agent",
"timestamp": 1508001461
},
"23": {
"oldstate": "exhausted",
"newstate": "cleanout",
"location": "Agent",
"timestamp": 1508002420
}
},
"user": "fabozzi",
"jobgroup": 56778,
"taskType": "reprocessing",
"owner": "fabozzi",
"inputfiles": [
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/420F5448-05B0-E711-B692-D4AE526A10E8.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 978912,
"size": 0,
"runs": [
{
"run_number": 299478,
"lumis": [
140
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/9EE6A685-05B0-E711-88B2-1866DA879EDE.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 969250,
"size": 0,
"runs": [
{
"run_number": 299478,
"lumis": [
166
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/1C2A5BA3-08B0-E711-8A3C-1866DA87EE25.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 969265,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
83
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/220A0342-09B0-E711-9D9F-1866DA87D585.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 969270,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
134
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/6E813BF1-17B0-E711-BFCE-1866DA87A65E.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 978917,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
191
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/2C3D69DB-17B0-E711-9D61-0CC47A0093EC.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 978926,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
203
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/BE305B36-18B0-E711-9BCB-0CC47A7EEE76.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 975268,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
217
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/64222FE7-19B0-E711-A5C8-1866DA8908C7.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981687,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
263
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/AE28B499-19B0-E711-AE47-0CC47A7EEE1E.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 978931,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
272
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/7017C6A4-19B0-E711-809A-0CC47A00A814.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 984404,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
286
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/A6CAB59E-19B0-E711-931A-1866DA852F52.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980316,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
295
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/8C9F270A-1DB0-E711-A03A-1866DA87967B.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983252,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
399
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/6237F91A-1DB0-E711-8401-1866DA89035E.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980326,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
408
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/A2C3582B-1DB0-E711-A28B-0CC47A01CAEA.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980331,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
435
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/4063A32A-1DB0-E711-8039-0CC47A7EEC70.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 978951,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
444
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/56AEE937-1DB0-E711-967E-1866DA87931C.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981702,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
452
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/5AB9EF18-1DB0-E711-9434-0CC47A7EEE80.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 976500,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
468
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/46425C2B-1FB0-E711-A61A-0CC47A00941C.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983262,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
555
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/4481866D-20B0-E711-AB54-0CC47A7EEE96.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980341,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
606
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/A6EC1E99-20B0-E711-98BA-0CC47A7EEEA0.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980346,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
628
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/78EFF597-20B0-E711-9A27-0CC47A00934A.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981712,
"size": 0,
"runs": [
{
"run_number": 299480,
"lumis": [
648
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/1C7205ED-0DB0-E711-A472-D4AE526A0C7A.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 969285,
"size": 0,
"runs": [
{
"run_number": 299481,
"lumis": [
12
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00000/244EE0A4-0FB0-E711-9BC5-1866DA890A7A.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980237,
"size": 0,
"runs": [
{
"run_number": 299592,
"lumis": [
63
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00000/DA58EBE2-0FB0-E711-9C40-1866DA852F52.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 973960,
"size": 0,
"runs": [
{
"run_number": 299593,
"lumis": [
252
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00000/709F8A3E-10B0-E711-B13C-0CC47A7EEDB0.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 973965,
"size": 0,
"runs": [
{
"run_number": 299593,
"lumis": [
293
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/10AD4897-23B0-E711-9D50-D4AE526A0CE0.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 986502,
"size": 0,
"runs": [
{
"run_number": 300122,
"lumis": [
570
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/A6C71910-24B0-E711-8487-1866DA87EE25.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983197,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
60
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/5E0BB816-24B0-E711-8C58-0CC47A7EEE80.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980296,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
166
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/34A99D8A-24B0-E711-AF6C-1866DA87AFB4.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983217,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
209
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/EAA5204A-25B0-E711-B438-0CC47A0092D0.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 984359,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
410
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/18F52C3E-25B0-E711-9586-0CC47A7EED28.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980301,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
415
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/22A4B5A6-25B0-E711-AE73-0CC47A7EEE92.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980311,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
471
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/E8F0EE68-26B0-E711-8C6F-0CC47A7EEE1E.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981667,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
558
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/0647BDCF-27B0-E711-8A96-0CC47A009148.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981677,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
645
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/C68F2176-28B0-E711-9F57-0CC47A7EEC70.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981682,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
848
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/04E8479B-28B0-E711-A3FC-0CC47A00AA80.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 984394,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
889
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00002/62BEE194-29B0-E711-87DB-0CC47A0091C6.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983242,
"size": 0,
"runs": [
{
"run_number": 300157,
"lumis": [
945
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/FE6717A3-0EB0-E711-A4CB-D4AE526A05F2.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981757,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
68
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/0AEFFEB9-0EB0-E711-98AF-1866DA87A864.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980367,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
74
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/3413D0E0-0EB0-E711-9018-1866DA87AC15.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983303,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
114
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/8EF12D04-0FB0-E711-82EC-1866DA87EE25.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 977689,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
121
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/86512A14-0FB0-E711-93D6-1866DA890A7A.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 977694,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
126
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/C2D75E2A-11B0-E711-8D5D-0CC47A009E10.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 986533,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
133
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/C420BC1D-11B0-E711-94C4-0CC47A009148.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 976541,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
165
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/4467C0A7-11B0-E711-B6B7-1866DA89061C.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 976546,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
212
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/48682D85-12B0-E711-BFC5-1866DA89061C.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 977704,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
294
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/B213A4BC-12B0-E711-9E07-D4AE526A05F2.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983308,
"size": 0,
"runs": [
{
"run_number": 300459,
"lumis": [
324
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/BEEF2304-22B0-E711-824A-1866DA87D7BF.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981767,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
276
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/365370FA-12B0-E711-9782-1866DA87AFB4.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 970551,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
286
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/E0F7BE2F-13B0-E711-80F3-0CC47A00A832.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 972054,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
307
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/00BE4B21-22B0-E711-A1CD-1866DA879B33.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983313,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
316
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/867E6813-22B0-E711-A1BA-0CC47A7EEC70.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980382,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
339
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/A2EA175B-25B0-E711-8FD4-1866DA87F44A.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981777,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
379
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/9AFB4E9E-15B0-E711-A707-1866DA87A664.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 972074,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
550
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/E0293034-16B0-E711-A8C0-0CC47A0109A6.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 972084,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
569
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/64DF8A90-25B0-E711-BBBE-1866DA89044E.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 985524,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
594
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/82427D44-17B0-E711-BD05-0CC47A7EEF1A.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 970566,
"size": 0,
"runs": [
{
"run_number": 300466,
"lumis": [
629
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/C4C80BA8-25B0-E711-9A28-0CC47A7EEE32.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981797,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
28
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/B8D2D480-17B0-E711-AE38-0CC47A7EEE92.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 970571,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
58
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/C0ADB386-18B0-E711-8F52-0CC47A7EEE92.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 970576,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
99
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/E0A91FA0-26B0-E711-8A1F-0CC47A00A832.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981802,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
153
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/4A124EE9-18B0-E711-AAD4-1866DA87A870.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 974052,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
172
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/96328C77-26B0-E711-9A32-0CC47A7EEE96.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 980387,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
210
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/26580AC1-26B0-E711-86F8-0CC47A009E22.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 984427,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
220
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/202E9572-1AB0-E711-BED8-1866DA86CCDF.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 974057,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
235
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/0844AF69-1AB0-E711-BB7E-0CC47A010010.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 973117,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
248
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/9AA424A2-1AB0-E711-8F8B-0CC47A7EED28.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 972089,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
276
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/26388CC0-1AB0-E711-81CC-1866DA87A96C.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 974067,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
315
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/4EAF5B1F-1BB0-E711-8391-0CC47A0107D0.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 974077,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
341
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/A052F021-1BB0-E711-82DB-D4AE526A0A7B.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 974082,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
369
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/A4314DBA-1AB0-E711-B769-1866DA879B75.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 970581,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
379
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 974087,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
396
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/0416B512-1BB0-E711-AB9E-D4AE5269DC07.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981807,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
410
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/DC9DDFC2-26B0-E711-886B-D4AE526A0A39.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981812,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
437
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/DE0432D0-26B0-E711-896F-0CC47A7EEF1A.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981817,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
463
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/8244C7B2-1BB0-E711-B8D7-1866DA8903B2.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 974097,
"size": 0,
"runs": [
{
"run_number": 300467,
"lumis": [
493
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/94AF4383-1FB0-E711-8ABB-1866DA87AF6C.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981822,
"size": 0,
"runs": [
{
"run_number": 300516,
"lumis": [
12
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/C06B740F-20B0-E711-AA40-1866DA890268.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 983293,
"size": 0,
"runs": [
{
"run_number": 300516,
"lumis": [
59
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
},
{
"lfn": "/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/DA408546-20B0-E711-A4AD-D4AE526A0C7A.root",
"locations": [
],
"first_event": 0,
"checksums": {
},
"id": 981743,
"size": 0,
"runs": [
{
"run_number": 300517,
"lumis": [
30
]
}
],
"last_event": 0,
"parents": [
],
"events": 0,
"merged": true
}
],
"jobType": "Merge",
"task": "/fabozzi_Run2017C-v1-DoubleMuonLowMass-12Sep2017_9211_170922_194400_3624/DataProcessing/DataProcessingMergeDQMoutput",
"group": "ppd",
"name": "b7ae480e-b03b-11e7-bce9-02163e01877e-0",
"mask": {
"LastRun": null,
"FirstRun": null,
"LastEvent": null,
"FirstEvent": null,
"LastLumi": null,
"FirstLumi": null
},
"jobid": 684921,
"type": "job"
}
http://vocms0250.cern.ch:5984/wmagent_jobdump%2Fjobs/_design/JobDump/_show/jobSummary/684921
And WMBS also has the correct data (78 input files and 1 output file):
SQL> select count(*) from wmbs_job_assoc where job=684921;
79
However, the FWJR doesn't have any input files:
"cmsRun1": {
"status": 70452,
"logs": {
},
"stop": 1508000434,
"site": "T2_UK_London_Brunel",
"input": {
},
"errors": [
{
"details": "No run/lumi information in file (WMAgent)., file was /store/data/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00000/98FA9531-01B1-E711-85F5-0025904C7E04.root",
"type": "NoRunLumiInformation",
"exitCode": 70452
}
],
"parameters": {
},
"output": {
"analysis": [
],
"Merged": [
{
"branch_hash": "d41d8cd98f00b204e9800998ecf8427e",
"lfn": "/store/data/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00000/98FA9531-01B1-E711-85F5-0025904C7E04.root",
"dataset": {
"applicationName": "cmsRun",
"applicationVersion": "CMSSW_9_2_11",
"processedDataset": "Run2017C-12Sep2017-v1",
"dataTier": "DQMIO",
"primaryDataset": "DoubleMuonLowMass"
},
"checksums": {
"adler32": "749f8434",
"cksum": "2380502578"
},
"guid": "98FA9531-01B1-E711-85F5-0025904C7E04",
"size": 17030,
"acquisitionEra": "Run2017C",
"configURL": "None;;None;;None",
"events": 0,
"merged": false,
"validStatus": "PRODUCTION",
"ouput_module_class": "DQMRootOutputModule",
"globalTag": "92X_dataRun2_2017Repro_v4",
"pfn": "/scratch/dir_25531/glide_SjAjIU/execute/dir_25461/job/WMTaskSpace/cmsRun1/Merged.root",
"catalog": "",
"module_label": "Merged",
"inputPath": null,
"runs": {
},
"processingVer": 1,
"processingStr": "12Sep2017",
"prep_id": ""
}
]
},
fwjr location
/data/srv/wmagent/v1.1.6.patch4/install/wmagent/JobCreator/JobCache/fabozzi_Run2017C-v1-DoubleMuonLowMass-12Sep2017_9211_170922_194400_3624/DataProcessing/DataProcessingMergeDQMoutput/JobCollection_56778_0/job_684921/Report.4.pkl
The weird thing is that the job retry postfixes are 684921-1, 684921-3, 684921-4 instead of -0, -1, -2.
Also, all of these failures are for the DataProcessingMergeDQMoutput task. There are 27 successful jobs and 9 failures (all of them with error 70452): 4 at T2_UK_London_Brunel and 5 at T2_UK_SGrid_RALPP.
Looking at the input files, all the locations are missing in the couch document above.
In WMBS, I checked the locations of a couple of input files (they don't seem to include either of the 2 sites above, though):
SQL> select wl.cms_name from wmbs_file_location wf inner join wmbs_file_details wfd on wfd.id=wf.fileid inner join wmbs_location wl on wl.id=wf.location where wfd.lfn='/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/DA408546-20B0-E711-A4AD-D4AE526A0C7A.root';
T3_UK_London_QMUL T3_UK_SGrid_Oxford T3_UK_London_RHUL T2_UK_London_IC T3_UK_ScotGrid_GLA
SQL> select wl.cms_name from wmbs_file_location wf inner join wmbs_file_details wfd on wfd.id=wf.fileid inner join wmbs_location wl on wl.id=wf.location where wfd.lfn='/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/420F5448-05B0-E711-B692-D4AE526A10E8.root';
T3_UK_London_QMUL T3_UK_SGrid_Oxford T3_UK_London_RHUL T2_UK_London_IC T3_UK_ScotGrid_GLA
Thanks, Seangchan. I have a question about these couch docs though, do they have any effect on a job or they are there only for monitoring/summary purpose? I guess it's the latter.
To complement all information you have already retrieved (especially that wmbs contains the runs/lumis for those files), I managed to fetch the logArchive (cmsRun logs) and the job tarball (with all condor logs and WMA reports). http://amaltaro.web.cern.ch/amaltaro/forWMCore/Issue_8258/
As you can see, we have all 4 Report.pkl files in there. It's worth noticing that the job PSet configuration had all those files to be merged (since it's a merge job, we do not provide run/lumi info): http://amaltaro.web.cern.ch/amaltaro/forWMCore/Issue_8258/cmsRun1/myPSet.py
and this is the CMSSW FJR: http://amaltaro.web.cern.ch/amaltaro/forWMCore/Issue_8258/cmsRun1/FrameworkJobReport.xml
which is clearly missing the input files (Inputs) and the run/lumi information (Runs).
Last but not least, the condor.out shows the Merged.root file under cmsRun1 dir, but it shows no attempt to transfer that file into the merged area(?) Maybe because the FJR is first parsed and then if everything is Ok, it tries the stage out?
I see lots of confusing things, but I still can't explain why those jobs went to T2_UK_London_Brunel instead of T2_UK_London_IC.
These were the classads defined for the merge job:
DESIRED_Sites = "T2_UK_London_Brunel"
ExtDESIRED_Sites = "T2_UK_London_Brunel,T3_UK_London_QMUL,T3_UK_ScotGrid_GLA"
DESIRED_CMSDataLocations = "T2_UK_London_IC,T3_UK_London_QMUL,T3_UK_London_RHUL,T3_UK_SGrid_Oxford,T3_UK_ScotGrid_GLA"
and picking one unmerged file that was supposed to be merged here (this is the weirdest part)
SELECT wlpnn.pnn FROM wmbs_file_details wfd INNER JOIN wmbs_file_location wfl ON wfd.id = wfl.fileid
INNER JOIN wmbs_location_pnns wlpnn ON wfl.location = wlpnn.location
WHERE wfd.lfn='/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root'
GROUP BY wlpnn.pnn;
PNN
T2_UK_London_Brunel
T2_UK_London_IC
T2_UK_SGrid_RALPP
which is similar to the query performed in the agent for building the merge jobs: https://github.com/dmwm/WMCore/blob/master/src/python/WMCore/JobSplitting/ParentlessMergeBySize.py#L199
How come a file has several PNNs? Not to mention that these PNNs/PSNs for UK are sort of a mess...
I have also found the processing job that produced this unmerged file, and as we can see it ran at T2_UK_London_IC
===> Stage Out Successful: {'StageOutReport': [], 'LFN': '/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root',
'PFN': 'srm://gfe02.grid.hep.ph.ic.ac.uk:8443/srm/managerv2?SFN=/pnfs/hep.ph.ic.ac.uk/data/cms/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root',
'PNN': 'T2_UK_London_IC', 'Checksums': {'adler32': '9d483b93', 'cksum': '2043391182'}, 'StageOutCommand': 'gfal2'}
full logtarball can be found at the same place as the others:
Second weird fact is that I cannot find any of these files in dbsbuffer_files table ?!?!
Last but not least, JobAccountant retrieves a single PNN from the job report: https://github.com/dmwm/WMCore/blob/master/src/python/WMComponent/JobAccountant/AccountantWorker.py#L812
Even if London_IC was in drain or down status, we should still see it figuring in the ExtDESIRED_Sites ad...
Ok, and this is the final problem. The ACDC doc also has those 3 PNNs for that unmerged file:
"/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root":{"lfn":"/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root","locations":["T2_UK_London_IC","T2_UK_London_Brunel","T2_UK_SGrid_RALPP"]
now we need to find out why. That means, the ACDC might fail as well...
Pissed, but at the same time relieved. After wasting several hours debugging JobAccountant and JobCreator, it turns out the problem is really on the following sites setup:
>>> from WMCore.Services.SiteDB.SiteDB import SiteDBJSON
>>> psn_pnns = SiteDBJSON().PSNtoPNNMap()
>>> pprint(psn_pnns)
...
u'T3_UK_London_QMUL': set([u'T2_UK_London_Brunel',
u'T2_UK_London_IC',
u'T2_UK_SGrid_RALPP']),
u'T3_UK_London_RHUL': set([u'T2_UK_London_IC', u'T2_UK_SGrid_RALPP']),
u'T3_UK_SGrid_Oxford': set([u'T2_UK_London_IC', u'T2_UK_SGrid_RALPP']),
u'T3_UK_ScotGrid_GLA': set([u'T2_UK_London_Brunel',
u'T2_UK_London_IC',
u'T2_UK_SGrid_RALPP']),
...
where, in plain English, it means that the unmerged file was staged to the T2_UK_London_IC PNN, which maps to some PSNs that map back to 3 different PNNs. We need to contact the SS team, AFAIK one PSN cannot have more than one PNN!!!
Soo, getting the list of PSNs for that unmerged file (everything looks Ok):
SELECT wfd.id AS file_id, wfd.lfn AS file_lfn, wl.site_name FROM wmbs_file_details wfd INNER JOIN wmbs_file_location wfl ON wfd.id = wfl.fileid
INNER JOIN wmbs_location_pnns wlpnn ON wfl.location = wlpnn.location
INNER JOIN wmbs_location wl ON wl.id = wlpnn.location
WHERE wfd.lfn='/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root'
GROUP BY wfd.id, wfd.lfn, wl.site_name;
file_id file_lfn site_name
974087 /store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root T3_UK_SGrid_Oxford
974087 /store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root T3_UK_London_QMUL
974087 /store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root T3_UK_London_RHUL
974087 /store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root T2_UK_London_IC
974087 /store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root T3_UK_ScotGrid_GLA
while getting their mapping back to PNN (which is called here) https://github.com/dmwm/WMCore/blob/master/src/python/WMCore/JobSplitting/ParentlessMergeBySize.py#L199
SELECT wfd.id AS file_id, wfd.lfn AS file_lfn, wlpnn.pnn FROM wmbs_file_details wfd INNER JOIN wmbs_file_location wfl ON wfd.id = wfl.fileid
INNER JOIN wmbs_location_pnns wlpnn ON wfl.location = wlpnn.location
WHERE wfd.lfn='/store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root'
GROUP BY wfd.id, wfd.lfn, wlpnn.pnn;
file_id file_lfn pnn
974087 /store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root T2_UK_London_Brunel
974087 /store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root T2_UK_SGrid_RALPP
974087 /store/unmerged/Run2017C/DoubleMuonLowMass/DQMIO/12Sep2017-v1/00001/CAEF3FED-1AB0-E711-A5BA-0CC47A009258.root T2_UK_London_IC
One PSN can have as many PNN as you want, as long as jobs running at the PSN correctly record where they staged their output to. Also, a PNN can be mapped to multiple PSN as long as each PSN can read files from that PNN.
The only real requirement for PSN/PNN mappings are that jobs sent to a PSN need to be able to read files from all mapped PNN and that they correctly record what PNN they staged their output to.
This means though that the agent cannot go through a PNN->PSN->PNN translation and expect the output PNN to be identical to the input PNN.
That being said, the PSN with multiple PNN use case is certainly unusual and it's worth checking if this is really correct. The PNN mapped to multiple PSN use case is not that unusual though.
I couldn't find the original definition/design of PSN x PNN. But
> One PSN can have as many PNN as you want, as long as jobs running at the PSN correctly record where they staged their output to.

I thought it wasn't allowed, and probably these T3 sites are the first ones to use this feature.
> Also, a PNN can be mapped to multiple PSN as long as each PSN can read files from that PNN.

yes, this makes sense. This is the classical case for opportunistic resources.
Just to avoid further problems, I think we should not add T3_* resources to the standard production agents. And actually remove those 2 T3 sites from all the production agents.
Then, second step is to make sure we load the proper PNNs in the job splitting factory.
In general that's wrong. We need to be able to run at T3 sites, also for production. For now, removing the problematic sites until we have had a chance to follow up via SiteSupport on whether the setup is correct is a reasonable workaround though.
As I said, the multiple PNN for a single PSN use case is unusual. I still think we should support it (and that very well might require debugging/fixing code that can't deal with this right now), but it's worth checking if this is really correct in this case.
Getting back to this, I think removing those problematic sites from wmbs_location might cause more harm than good. Since files will have a location that no longer exists in the database, that might just crash something somewhere else. Unless we also update wmbs_file_location table by deleting all entries for those sites...
About the real fix, instead of setting file locations as a location from the wmbs_location table (id column), we should instead set it to the actual pnn from wmbs_location_pnns (pnn column).
@ticoann would you have any insight on this issue? I'll start working on the pnn vs location id fix.
About the short-term fix, we need to (of course, with components down):
('T3_UK_London_QMUL', 'T3_UK_ScotGrid_GLA')
from wmbs_location; then on the DBSBuffer tables:
@ticoann @hufnagel am I missing something? A conservative approach would be to live with this problem until we deploy new agents (and don't add T3s to resource control). Let me know please.
As I said, indiscriminately not adding T3 to resource control is not an option. Make a list of "bad" sites for now that we can't handle, don't just say "we do not run at T3".
On your proposal, as long as all files you do this for have other valid locations this should work. As you said before, if you end up with a file with no location, you could be in a real mess since that likely triggers other problems in the code that doesn't expect this. Best case I would expect these files to remain zombies forever, worst case they crash some other code and we have to completely remove the files and all their records (which would be extremely tricky if they are assigned to jobs already).
Also, isn't this only relevant for files not assigned to jobs yet ? What about files that are assigned to jobs ? Are these jobs fixable or do we need to fail them ?
Dirk, I'm not banning T3s from production until the end of days. But if we have to ban T3s until it gets properly fixed, I'd happily do so. We anyway don't rely on T3s for any serious production campaign, except when opportunistic resources have been properly commissioned and are in use in Unified.
@ticoann I forgot chatting with you about this issue, can you please have a look at my last 2 or 3 comments and let me know if I'm missing anything (or if I should not proceed with the DB surgery)?
T3_US_OSG is in Unified and in serious use. Don't ban them all, ban only the ones you need to.
@amaltaro can you please summarize how the complex UK pnn/psn mapping leads to the NoRunLumiInformation error ?
removing T3 is not an option indeed.
Disclaimer: examples below with a different file from different workflow, the previous one has been archived.
This issue is twisting my mind, but let me try to summarise the problem.
When a file is created in WMBS (either from a job or from WMBSHelper), we use its PNN in order to get its location (PSN) from wmbs_location_pnns. The problem is that this location (PSN) is one site entry in the wmbs_location table, which can have several PNNs associated with it. Since we use this location - later on - to get the list of PNNs, we retrieve more PNNs than we should.
Another example of file/location registration in wmbs (from what I've just explained):
select * from wmbs_file_location where fileid=(select id from wmbs_file_details where lfn='/store/unmerged/RunIISummer15wmLHEGS/DYJetsToLL_M-105To160_VBFFilter_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/LHE/VBFPostMGFilter_MCRUN2_71_V1-v1/00001/38CE6565-91B3-E711-97DB-1866DA87AC15.root');
fileid location
1333934 6
1333934 51
1333934 68
1333934 72
1333934 75
now with these file locations, if we want to check what are the PNNs where this file is available:
SELECT wl.site_name, wlpnn.pnn FROM wmbs_file_details wfd
INNER JOIN wmbs_file_location wfl ON wfd.id = wfl.fileid
INNER JOIN wmbs_location wl ON wfl.location = wl.id
INNER JOIN wmbs_location_pnns wlpnn ON wl.id = wlpnn.location
WHERE wfd.lfn='/store/unmerged/RunIISummer15wmLHEGS/DYJetsToLL_M-105To160_VBFFilter_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/LHE/VBFPostMGFilter_MCRUN2_71_V1-v1/00001/38CE6565-91B3-E711-97DB-1866DA87AC15.root'
GROUP BY wl.site_name, wlpnn.pnn;
T3_UK_London_RHUL T2_UK_SGrid_RALPP
T3_UK_London_QMUL T2_UK_SGrid_RALPP
T3_UK_ScotGrid_GLA T2_UK_London_Brunel
T3_UK_London_QMUL T2_UK_London_Brunel
T2_UK_London_IC T2_UK_London_IC
T3_UK_SGrid_Oxford T2_UK_SGrid_RALPP
T3_UK_ScotGrid_GLA T2_UK_London_IC
T3_UK_ScotGrid_GLA T2_UK_SGrid_RALPP
T3_UK_London_RHUL T2_UK_London_IC
which is probably wrong. It could be that this file is available only at T2_UK_London_IC, but this mapping back and forth just brings in all the PNNs a PSN is connected to.
I think one of the ways to solve it would be by specifically adding the PNN name to the wmbs_location (instead of the location/PSN). Drawback is that an integer location will become a string column.
Second possibility would be to create an index for wmbs_location_pnns table and use that index as location.
@ticoann @hufnagel let me know if you have any preference. I'm starting to work on the first one right now.
The underlying problem is that WMAgent historically had a single location concept that was tied to a site. Which is wrong since location for job submission and location for data are very different things. It just looked like they were the same in the old grid model where you had cpu and storage always co-located and completely tied to each other. As these old grid concepts break down and people try new concepts, we are having trouble.
We have reworked this once already (and it did improve), but IMO this didn't go far enough.
You have an identifier for a site where you sent jobs to. You have an identifier for a storage system where data is recorded. The code should not make ANY assumptions about how they are related to each other (and yes, storing them in the same table or setting them up in the same command makes assumptions). You have a mapping in an association table that ties them together, that's all.
wmbs_location should not contain any pnn. There should be a separate pnn table and an assoc table that ties the location id to the pnn id it maps to. File location is then recorded with the pnn id and not the location id (since files don't have such a location, it doesn't exist, they only have pnn).
That's the clean way.
Something like this:
"""CREATE TABLE wmbs_file_location (
fileid INTEGER NOT NULL,
pnn INTEGER NOT NULL,
PRIMARY KEY(fileid, pnn),
FOREIGN KEY(fileid) REFERENCES wmbs_file_details(id)
ON DELETE CASCADE,
FOREIGN KEY(pnn) REFERENCES wmbs_location_pnn(id)
ON DELETE CASCADE)"""
"""CREATE TABLE wmbs_location_pnn (
id INTEGER,
pnn VARCHAR(255),
PRIMARY KEY(id)
)"""
"""CREATE TABLE wmbs_location_pnn_assoc (
location INTEGER NOT NULL,
pnn INTEGER NOT NULL,
PRIMARY KEY(location, pnn),
FOREIGN KEY(location) REFERENCES wmbs_location(id)
ON DELETE CASCADE,
FOREIGN KEY(pnn) REFERENCES wmbs_location_pnn(id)
ON DELETE CASCADE)"""
You see that wmbs_file_location has no direct relationship to wmbs_location anymore, nor should it.
I'm working along these lines, but instead of storing an id and using a diff *_assoc table, I'll simply register the files location as PNNs (string, not the id). This reduces one join in the queries, but the data volume is slightly bigger. Do you think there might be performance penalties with this change? It looks the cleanest IMO.
It's bad design. In functional terms it basically blows up the wmbs_location table since you have to store a copy of it for every PNN that location maps to. At that point you rely on code for your db integrity (since only code makes sure these multiple records are identical).
Also, it's a hacky workaround, we don't really have so many locations. You basically play tricks in the schema to work around code problems, which is always a bad idea. In a database-driven project like ours, it's always the smart move to have a clean data model that represents the actual reality of what you have to work with. It's not like this location/pnn hybrid has any real meaning.
You can't store multiple PNN in the same wmbs_location record since then you have the same problem. File maps to location which maps to multiple PNN, only one of which the file is actually located at.
I guess you just misunderstood me. I'll create the PR tomorrow or on Monday and we can follow up from there.
Here's the document we based the PNN transition on: https://www.evernote.com/l/ADAAbtE6c71KJayF-unaJyjbKxahqdupXlE The point was definitely to have multiple PSNs per PNN.
Thank you very much, Eric. I spent 15min scanning my emails looking for it. Haaa, and this is what I was looking for:
But this will be a 1:N mapping only, with a single data location having 1 or more processing locations. We make the assumption here that a single site (processing location) has one unified storage system. This simplifies the problem significantly and is true for all of our current resources (after the tape/disk separation). Non-unified storage systems could still be supported, but we would need to create multiple processing locations for them, each with access to their own unified storage systems.
I don't need to rephrase it saying that the original design did NOT expect multiple PNNs for 1 PSN.
Eric, Dirk, please confirm whether we have to support N:N mapping. If not, then those resources have to be fixed up in SiteDB and Site Support team needs to be informed that that is not supported. I'm happy to change this PR such that we support it in WMAgent, just don't want to waste time before we have a decision.
Note that I think we've always had the concept of a close SE or PNN and that shouldn't change. In other words, when a job at a PSN writes data, it goes to one PNN. However, a PSN should be able to be configured to read data from multiple PNNs.
Yes, one file goes to one single PNN. Ok, gotcha! I'll update this PR to support N:N mapping.
To be investigated once https://github.com/dmwm/WMCore/issues/8220 is sorted out. From JIRA ticket: https://its.cern.ch/jira/browse/CMSCOMPPR-1552
and from wmstats: