Merge pull request #223 from cernopendata/pr-214

cms-derived-data: updates to the script
cernopendata · Feb 29, 2024 · 0566889 · 0566889
2 parents ed913b1 + cfac89f
commit 0566889
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 35 deletions.
diff --git a/cms-derived-data/config.yaml b/cms-derived-data/config.yaml
@@ -2,12 +2,11 @@
 # all recid_start are placeholders, should be changed later to correct ones
 common_values:
   accelerator: CERN-LHC
-  authors: 
-    - CMS Open Data Group
+  authors: CMS Open Data group
   collections:
     - CMS-Derived-Datasets
   dataset_semantics:
-  date_published: 2023
+  date_published: "2024"
   experiment: CMS
   license:
     attribution: CC0
@@ -26,49 +25,49 @@ NanoAODRun1:
       <a href="https://root.cern/">ROOT</a> or other ROOT-compatible software, and containing the per-event information that is needed in most generic analyses. 
       In contrast to the CMS NanoAOD format which is derived from MiniAOD, it is generated directly from the AOD format with completely 
       independent code provided by the CMS open data group. Nevertheless, there is a large overlap in functionality and content between 
-      NanoAODRun1 and NanoAOD such that common analyses are possible. It is provided as a collection of root files under <dataset> directory, 
-      and in <dataset>_merged.root with the separate files in the <dataset> directory merged into one file. This dataset was derived from 
-      the primary dataset in MiniAOD format linked below.</p>
+      NanoAODRun1 and NanoAOD such that common analyses are possible.</p>
+      <p>The dataset is provided as a collection of root files and in <dataset>_merged.root with the separate files merged into one file. 
+      This dataset was processed from the primary dataset in AOD format linked below.</p>
   distribution:
     formats:
       - nanoaod-run1
   methodology:
     description: The datasets were produced with the software available in the record linked below.
     links:
-      - recid: 12505 
+      - recid: "12505" 
   use_with:
-    description: This dataset can be used with the following analysis
+    description: Examples are provided to illustrate how CMS Open Data in a NanoAOD or NanoAOD-like format can be analyzed in
     links:
-      - recid: 70001 # placeholder recid for SW record for NanoAODRun1 usage examples
+      - recid: "70001" # placeholder recid for SW record for NanoAODRun1 usage examples
   validation:
     description: >
-      <p>These data were derived from the primary datasets linked below using only the validated runs.</p>
+      <p>These data were processed from the primary dataset using only the validated runs.</p>
 
 POET:
   recid_start: 50000
   title: <dataset> dataset in reduced NanoAOD-like format
   abstract:
     description: >
       <p>This dataset contains information extracted from different physics objects from the 2015 MiniAOD parent <parent_dataset> dataset, 
-      readable with bare <a href="https://root.cern/">ROOT</a> or other ROOT-compatible software. It was produced for the CMS open data workshop tutorials. 
-      It is provided in three different structures: a collection of root files under <dataset> directory as a direct output of POET 
-      (separate trees for each object), <dataset>_flat directory with root files "flattened" to a single tree, as required when used as input to coffea with 
-      nanoevents schema, and in <dataset>_flat_merged.root with the separate files in the <dataset>_flat directory merged into one file. 
+      readable with bare <a href="https://root.cern/">ROOT</a> or other ROOT-compatible software. It was produced for the CMS open data workshop tutorials.</p>
+      <p>It is provided in two different structures: a collection of root files as a direct output of POET 
+      (separate trees for each object), and "_flat.root" files "flattened" to a single tree, as required when used as input to coffea with 
+      nanoevents schema.  <dataset>_flat.root has the separate "_flat.root" files merged into one file. 
       This dataset was derived from the primary dataset in MiniAOD format linked below.</p>
   distribution:
     formats:
       - nanoaod-poet
   methodology:
     description: The datasets were produced with the software available in the record linked below.
     links:
-      - recid: 12502
+      - recid: "12502"
   use_with:
-    description: The use of this dataset does not require any software specific to the CMS experiment. It can be read with the ROOT package
+    description: The use of this dataset does not require any software specific to the CMS experiment. A tutorial lesson is available in
     links:
       - url: https://cms-opendata-workshop.github.io/workshop2022-lesson-ttbarljetsanalysis/
   validation:
     description: >
-      <p>These data were derived from the primary datasets linked below using only the validated runs. 
+      <p>These data were processed from the primary dataset using only the validated runs. 
        No further validation was done for the output.</p>
 
 PFNano:
@@ -85,12 +84,12 @@ PFNano:
   methodology:
     description: The datasets were produced with the software available in the record linked below.
     links:
-      - recid: 12504
+      - recid: "12504"
   use_with:
-    description: This dataset can be used with the following analysis # have to point to standard NanoAOD docs
+    description: Examples are provided to illustrate how CMS Open Data in a NanoAOD or NanoAOD-like format can be analyzed in
     links:
-      - recid: 12361
+      - recid: "12361"
   validation:
     description: >
-      <p>These data were derived from the primary datasets linked below.</p>
-...
+      <p>These data were processed from the primary dataset. The processed runs and lumi sections are in</p>
+...
diff --git a/cms-derived-data/create_cms_derived_datasets.py b/cms-derived-data/create_cms_derived_datasets.py
@@ -41,7 +41,13 @@ def get_file_size(file_path):
     file_size = os.path.getsize(file_path)/1024.0
     return math.ceil(file_size)
 
+def get_collision_information(parent_title):
+    """Return collision information."""
+    cmd = f"cernopendata-client get-metadata --title={parent_title} --output-value collision_information"
+    collision_information = subprocess.getoutput(cmd)
+    return collision_information
 
+
 def get_files(dataset_location):
     "Return file list with information about name, size, location for the given dataset and volume."
     files = []
@@ -97,18 +103,36 @@ def create_record(metadata, data_type):
 
     rec["accelerator"] = config["common_values"]["accelerator"]
 
-    rec["authors"] = config["common_values"]["authors"]
+    rec["authors"] = []
+    rec["authors"].append({
+        "name": config["common_values"]["authors"]
+    })
 
+    rec["collision_information"] = json.loads(metadata["collision_information"])
     rec["collections"] = config["common_values"]["collections"]
 
     #rec["dataset_semantics"] = config["common_values"]["dataset_semantics"]
-    rec["dataset_semantics"] = {}
-    if data_type == "NanoAODRun1":
-        rec["dataset_semantics"]["dataset_semantics_files"] = metadata["dataset_semantics_files"]
-
+    #rec["dataset_semantics_files"] = []
+    #dataset_semantics_files = [ metadata["dataset_semantics_files"]["html_doc"],metadata["dataset_semantics_files"]["json_doc"] ]
+    #if data_type == "NanoAODRun1":
+    #    rec["dataset_semantics"].append({
+    #        "files": dataset_semantics_files
+    #    })
+    if data_type != "POET":
+        rec["dataset_semantics_files"] = {}
+        rec["dataset_semantics_files"]["html_doc"] =  metadata["dataset_semantics_files"]["html_doc"].rsplit('/',1)[1]
+        rec["dataset_semantics_files"]["json_doc"] =  metadata["dataset_semantics_files"]["json_doc"].rsplit('/',1)[1]
+
     rec["date_published"] = config["common_values"]["date_published"]
 
     rec["distribution"] = {}
+    # changes format to nanoaodsim-NNN for MC - relies on having only one format (or nanoaod-NNN as the first)
+    if "Run201" not in metadata["dataset"]:
+        substr = "nanoaod"
+        repl = "nanoaodsim"
+        config[data_type]["distribution"]["formats"][0] = config[data_type]["distribution"]["formats"][0].replace(substr,repl)
+    config[data_type]["distribution"]["formats"].append("root")
+
     rec["distribution"]["formats"] = config[data_type]["distribution"]["formats"]
     rec["distribution"]["number_events"] = metadata["number_events"] 
     rec["distribution"]["number_files"] = metadata["number_files"]
@@ -117,7 +141,9 @@ def create_record(metadata, data_type):
     # uniqely generated for each record (?)
     rec["doi"] = ""
 
-    rec["experiment"] = config["common_values"]["experiment"]
+    rec["experiment"] = [
+        config["common_values"]["experiment"]
+    ]
 
     rec["files"] = metadata["files"]
 
@@ -129,12 +155,12 @@ def create_record(metadata, data_type):
 
     rec["publisher"] = config["common_values"]["publisher"]
 
-    rec["recid"] = metadata["recid"]
+    rec["recid"] = str(metadata["recid"])
 
     rec["relations"] = []
     rec["relations"].append({
         "description": "This dataset was derived from:",
-        "recid": metadata["parent_recid"],
+        "recid":str(metadata["parent_recid"]),
         "type": "isChildOf"
     })
 
@@ -146,10 +172,10 @@ def create_record(metadata, data_type):
 
     rec["validation"] = {}
     rec["validation"]["description"] = config[data_type]["validation"]["description"]
-    rec["validation"]["links"] = []
-    rec["validation"]["links"].append({
-        "recid": metadata["parent_recid"]
-    })
+    # rec["validation"]["links"] = []
+    # rec["validation"]["links"].append({
+    #     "recid": str(metadata["parent_recid"])
+    # })
 
     return rec
 
@@ -165,7 +191,7 @@ def print_records(records):
 @click.option('--data-type',
               '-t',
               required=True,
-              help='Data Type (NanoAODRun1, POET')
+              help='Data Type (NanoAODRun1, POET)')
 def main(data_type):
     "Do the job."
 
@@ -239,6 +265,7 @@ def main(data_type):
 
         # prepare metadata for creating the record
         metadata["parent_recid"] = get_parent_recid(metadata["parent"])
+        metadata["collision_information"] = get_collision_information(metadata["parent"])
         metadata["number_events"] = number_events
         metadata["number_files"] = number_files
         metadata["size"] = size