Merge pull request #224 from cernopendata/pr-214

cms-derived-data: add PFNano, updates other parts
cernopendata · Mar 1, 2024 · 9f4633b · 9f4633b
2 parents 0566889 + c4f1e4f
commit 9f4633b
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 52 deletions.
diff --git a/cms-derived-data/config.yaml b/cms-derived-data/config.yaml
@@ -18,21 +18,19 @@ common_values:
 
 NanoAODRun1:
   recid_start: 40000
-  title: <dataset> dataset in Run1 NanoAOD-like format
+  title: <dataset-title> dataset in Run1 NanoAOD-like format
   abstract:
     description: >
-      <p><dataset> dataset in a NanoAOD-like research-level Ntuple format for CMS Run1 data, readable with bare
-      <a href="https://root.cern/">ROOT</a> or other ROOT-compatible software, and containing the per-event information that is needed in most generic analyses. 
-      In contrast to the CMS NanoAOD format which is derived from MiniAOD, it is generated directly from the AOD format with completely 
-      independent code provided by the CMS open data group. Nevertheless, there is a large overlap in functionality and content between 
-      NanoAODRun1 and NanoAOD such that common analyses are possible.</p>
-      <p>The dataset is provided as a collection of root files and in <dataset>_merged.root with the separate files merged into one file. 
+      <p><dataset-title> dataset in a NanoAOD-like research-level Ntuple format for CMS Run1 data, readable with bare
+      <a href="https://root.cern/">ROOT</a> or other ROOT-compatible software, and containing the per-event information that is needed in most generic analyses.
+      In contrast to the CMS NanoAOD format which is derived from MiniAOD, it is generated directly from the AOD format with code provided by the CMS open data group. Nevertheless, there is a large overlap in functionality and content between NanoAODRun1 and NanoAOD such that common analyses are possible.</p>
+      <p>The dataset is provided as a collection of root files. <dataset>_merged.root contains the separate files merged into one file.
       This dataset was processed from the primary dataset in AOD format linked below.</p>
   distribution:
     formats:
       - nanoaod-run1
   methodology:
-    description: The datasets were produced with the software available in the record linked below.
+    description: The dataset was produced with the software available in the record linked below.
     links:
       - recid: "12505" 
   use_with:
@@ -45,20 +43,17 @@ NanoAODRun1:
 
 POET:
   recid_start: 50000
-  title: <dataset> dataset in reduced NanoAOD-like format
+  title: <dataset-title> dataset in reduced NanoAOD-like format
   abstract:
     description: >
-      <p>This dataset contains information extracted from different physics objects from the 2015 MiniAOD parent <parent_dataset> dataset, 
-      readable with bare <a href="https://root.cern/">ROOT</a> or other ROOT-compatible software. It was produced for the CMS open data workshop tutorials.</p>
-      <p>It is provided in two different structures: a collection of root files as a direct output of POET 
-      (separate trees for each object), and "_flat.root" files "flattened" to a single tree, as required when used as input to coffea with 
-      nanoevents schema.  <dataset>_flat.root has the separate "_flat.root" files merged into one file. 
+      <p>This dataset contains information extracted from different physics objects from the 2015 MiniAOD <parent_dataset> dataset, readable with bare <a href="https://root.cern/">ROOT</a> or other ROOT-compatible software. It is part of the datasets produced for the CMS open data workshop tutorials, and not all events in the parent dataset were necessarily processed.</p>
+      <p>It is provided in two different structures: a collection of root files as a direct output of POET (separate trees for each object), and "_flat.root" files "flattened" to a single tree, as required when used in columnar analysis with e.g. <a href="https://awkward-array.org/doc/main/">awkward array</a> and <a href="https://uproot.readthedocs.io/en/latest/index.html">uproot</a>. <dataset>_flat.root has the separate "_flat.root" files merged into one file.
       This dataset was derived from the primary dataset in MiniAOD format linked below.</p>
   distribution:
     formats:
       - nanoaod-poet
   methodology:
-    description: The datasets were produced with the software available in the record linked below.
+    description: The dataset was produced with the software available in the record linked below.
     links:
       - recid: "12502"
   use_with:
@@ -67,22 +62,20 @@ POET:
       - url: https://cms-opendata-workshop.github.io/workshop2022-lesson-ttbarljetsanalysis/
   validation:
     description: >
-      <p>These data were processed from the primary dataset using only the validated runs. 
-       No further validation was done for the output.</p>
+      <p>These data were processed from the primary dataset using only the validated runs. No further validation was done for the output.</p>
 
 PFNano:
   recid_start: 60000
-  title: <dataset> dataset in NanoAOD format enhanced with Particle Flow candidates
+  title: <dataset-title> dataset in NanoAOD format enhanced with Particle Flow candidates from RunG of 2016
   abstract:
     description: >
-      <p><dataset> dataset in NanoAOD format enhanced with Particle Flow candidates, readable with bare ROOT or 
-      other ROOT-compatible software. In addition to the default NanoAOD content, it contains the information. This dataset was derived from 
-      the primary dataset in MiniAOD format linked below.</p>
+      <p><dataset> dataset in NanoAOD format enhanced with Particle Flow candidates, readable with bare ROOT or other ROOT-compatible software. In addition to the default NanoAOD content, it contains the list of Particle Flow candidates. This dataset was derived from the primary dataset in MiniAOD format linked below.</p>
+      <p>The list of validated runs, which must be applied to all analyses, either with the full validation or for an analysis requiring only muons, can be found in:</p>
   distribution:
     formats:
       - nanoaod-pf
   methodology:
-    description: The datasets were produced with the software available in the record linked below.
+    description: The dataset was produced with the software available in the record linked below.
     links:
       - recid: "12504"
   use_with:
@@ -91,5 +84,5 @@ PFNano:
       - recid: "12361"
   validation:
     description: >
-      <p>These data were processed from the primary dataset. The processed runs and lumi sections are in</p>
+      <p>These data were processed from the MiniAOD primary dataset. If not equal to the parent, the processed runs and lumi sections are available below.</p>
 ...
diff --git a/cms-derived-data/create_cms_derived_datasets.py b/cms-derived-data/create_cms_derived_datasets.py
@@ -22,15 +22,15 @@ def get_number_events(file_path, data_type):
     """Return number of events in root file."""
     myfile = ROOT.TFile.Open(file_path)
     number_events = 0
-    if data_type == "NanoAODRun1":
+    if data_type == "NanoAODRun1" or data_type == "PFNano":
         number_events = myfile.Events.GetEntries()
     elif data_type == "POET":
         number_events = myfile.events.GetEntries()
     return number_events
 
 
 def get_parent_recid(parent_title):
-    """Return parent dataset recid."""
+    """Return parent dataset recid."""   
     cmd = f"cernopendata-client get-metadata --title={parent_title} --output-value recid"
     recid = subprocess.getoutput(cmd)
     return recid
@@ -42,7 +42,7 @@ def get_file_size(file_path):
     return math.ceil(file_size)
 
 def get_collision_information(parent_title):
-    """Return collision information."""
+    """Return collision information.""" 
     cmd = f"cernopendata-client get-metadata --title={parent_title} --output-value collision_information"
     collision_information = subprocess.getoutput(cmd)
     return collision_information
@@ -75,7 +75,7 @@ def get_files(dataset_location):
 def get_dataset_semantics_doc(dataset_name, sample_file_path, data_type):
     """Return the paths to the html doc and json doc of the given dataset."""
     output_dir = "outputs/docs"
-    script = "documentation.py" # for NanoAODRun1
+    script = "documentation.py" # for NanoAODRun1 
     if data_type == "PFNano":
         script = "inspectNanoFile.py"
 
@@ -99,8 +99,14 @@ def create_record(metadata, data_type):
     rec = {}
 
     rec["abstract"] = {}
-    rec["abstract"]["description"] = config[data_type]["abstract"]["description"].replace("<dataset>", metadata["dataset"]).replace("<parent_dataset>", metadata["parent"])
-
+    rec["abstract"]["description"] = config[data_type]["abstract"]["description"].replace("<dataset-title>", metadata["dataset-title"]).replace("<dataset>", metadata["dataset"]).replace("<parent_dataset>", metadata["parent"])
+    if data_type == "PFNano":
+        rec["abstract"]["links"] = []
+        for i in metadata["valid_recids"]:
+            rec["abstract"]["links"].append({
+                "recid": str(i)
+            })
+
     rec["accelerator"] = config["common_values"]["accelerator"]
 
     rec["authors"] = []
@@ -110,14 +116,7 @@ def create_record(metadata, data_type):
 
     rec["collision_information"] = json.loads(metadata["collision_information"])
     rec["collections"] = config["common_values"]["collections"]
-
-    #rec["dataset_semantics"] = config["common_values"]["dataset_semantics"]
-    #rec["dataset_semantics_files"] = []
-    #dataset_semantics_files = [ metadata["dataset_semantics_files"]["html_doc"],metadata["dataset_semantics_files"]["json_doc"] ]
-    #if data_type == "NanoAODRun1":
-    #    rec["dataset_semantics"].append({
-    #        "files": dataset_semantics_files
-    #    })
+
     if data_type != "POET":
         rec["dataset_semantics_files"] = {}
         rec["dataset_semantics_files"]["html_doc"] =  metadata["dataset_semantics_files"]["html_doc"].rsplit('/',1)[1]
@@ -126,8 +125,8 @@ def create_record(metadata, data_type):
     rec["date_published"] = config["common_values"]["date_published"]
 
     rec["distribution"] = {}
-    # changes format to nanoaodsim-NNN for MC - relies on having only one format (or nanoaod-NNN as the first)
-    if "Run201" not in metadata["dataset"]:
+    # changes format to nanoaodsim-NNN for MC - to be modified for PFNano sim if we make some (PFNano dataset names do not have Run2016 in them...)
+    if "Run201" not in metadata["dataset"] and data_type != "PFNano":
         substr = "nanoaod"
         repl = "nanoaodsim"
         config[data_type]["distribution"]["formats"][0] = config[data_type]["distribution"]["formats"][0].replace(substr,repl)
@@ -164,18 +163,19 @@ def create_record(metadata, data_type):
         "type": "isChildOf"
     })
 
-    rec["title"] =  config[data_type]["title"].replace("<dataset>", metadata["dataset"])
+    rec["title"] =  config[data_type]["title"].replace("<dataset-title>", metadata["dataset-title"])
 
     rec["type"] = config["common_values"]["type"]
 
     rec["use_with"] = config[data_type]["use_with"]
 
     rec["validation"] = {}
     rec["validation"]["description"] = config[data_type]["validation"]["description"]
-    # rec["validation"]["links"] = []
-    # rec["validation"]["links"].append({
-    #     "recid": str(metadata["parent_recid"])
-    # })
+    if data_type == "PFNano":
+        rec["validation"]["links"] = []
+        rec["validation"]["links"].append({
+                "url": "link to processedLumis.json"
+            })
 
     return rec
 
@@ -191,7 +191,7 @@ def print_records(records):
 @click.option('--data-type',
               '-t',
               required=True,
-              help='Data Type (NanoAODRun1, POET)')
+              help='Data Type (NanoAODRun1, POET, PFNano)')
 def main(data_type):
     "Do the job."
 
@@ -207,6 +207,12 @@ def main(data_type):
     elif data_type == "POET":
         date = "23-Jul-22"
         recid_start = config["POET"]["recid_start"]
+    elif data_type == "PFNano":
+        date = "29-Feb-24"
+        recid_start = config["PFNano"]["recid_start"]
+        parent_recid = 30500
+        valid_recids = [14220,14221]
+        process_path = "Run2016G-UL2016_MiniAODv2_PFNanoAODv1"
 
     records = []
     datasets_path = f"/eos/opendata/cms/derived-data/{data_type}/{date}"
@@ -217,10 +223,15 @@ def main(data_type):
             continue
 
         dataset_dir_path = f"{datasets_path}/{dataset}"
-        dataset_files = os.listdir(dataset_dir_path)
-
         metadata_yaml_file = open(f"{dataset_dir_path}/metadata.yaml", 'r')
         metadata = yaml.safe_load(metadata_yaml_file)
+
+        if data_type == "PFNano":
+            dataset_dir_path = f"{dataset_dir_path}/{process_path}"
+            next = os.listdir(dataset_dir_path)[0]
+            dataset_dir_path = f"{dataset_dir_path}/{next}/0000"
+
+        dataset_files = os.listdir(dataset_dir_path)
 
         files = get_files(dataset_dir_path)
         number_events = 0
@@ -246,6 +257,7 @@ def main(data_type):
                     size += file_size
             dataset_all_flattened_file_path = dataset_dir_path + "_flat.root"
             files.extend(get_files(dataset_all_flattened_file_path))
+            number_files += 1 # for _flat.root in POET 
         # NanoAODRun1 datasets
         elif data_type == "NanoAODRun1":
             # for all root files under <dataset> directory
@@ -260,12 +272,37 @@ def main(data_type):
             files.extend(get_files(dataset_all_merged_path))  # adds the merged file to the list of dataset files
             sample_file_path = f"{dataset_dir_path}/{dataset_files[0]}"
             metadata["dataset_semantics_files"] = get_dataset_semantics_doc(dataset, sample_file_path, data_type)
+            number_files += 1 # for _merged.root in NanoAODRun1
+        elif data_type == "PFNano":
+            for file in dataset_files:
+                if file.endswith("root"):
+                    file_path = f"{dataset_dir_path}/{file}"
+                    number_events += get_number_events(file_path, data_type)
+                    number_files += 1
+                    file_size = get_file_size(file_path)
+                    size += file_size
+            files.extend(get_files(dataset_dir_path))
+            sample_file_path = f"{dataset_dir_path}/{dataset_files[0]}"
+            metadata["dataset_semantics_files"] = get_dataset_semantics_doc(dataset, sample_file_path, data_type)
 
-        number_files += 1 # for _flat.root in POET and _merged.root in NanoAODRun1
-
-        # prepare metadata for creating the record
-        metadata["parent_recid"] = get_parent_recid(metadata["parent"])
-        metadata["collision_information"] = get_collision_information(metadata["parent"])
+        # prepare metadata for creating the record; PFNano is handled differently because cernopendata-client cannot reach datasets that are not yet released
+        if data_type == "PFNano":
+            metadata["parent_recid"] = str(parent_recid)
+            parent_recid += 1 # this assumes the loop runs in alphabetical order
+            metadata["collision_information"] = '{"energy": "13TeV","type": "pp"}'
+            metadata["valid_recids"] = []
+            metadata["valid_recids"] = valid_recids
+        else:
+            metadata["parent_recid"] = get_parent_recid(metadata["parent"])
+            metadata["collision_information"] = get_collision_information(metadata["parent"])
+
+        # For MC, remove the processing string from the name for the title
+        metadata["dataset-title"] = metadata["dataset"]
+        if "Run201" not in metadata["dataset"]:
+            if data_type == "NanoAODRun1":
+                metadata["dataset-title"] = metadata["dataset"].split('DR',1)[1].split('_',1)[1]
+            elif data_type == "POET":
+                metadata["dataset-title"] = metadata["dataset"].split('_',1)[1]
         metadata["number_events"] = number_events
         metadata["number_files"] = number_files
         metadata["size"] = size