cms-2016-collision-datasets: adds full provenance #222

Merged: 5 commits, Mar 8, 2024
Changes from all commits
21 changes: 21 additions & 0 deletions cms-2016-collision-datasets/code/create-das-json-config-store.sh
@@ -14,6 +14,21 @@ while IFS= read -r dataset; do
done
done < ./inputs/cms-2016-collision-datasets.txt

# get the config file info for the RECO step (the parent of MINI)
minis=$(grep '/MINI' inputs/cms-2016-collision-datasets.txt)
for dataset in $minis; do
parent=$(dasgoclient -query "parent dataset=$dataset")
dataset_result_file=$(echo "$parent" | tr '/' '@')
okay=0
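# retry until the DAS config query succeeds; note this loops
# indefinitely if the query keeps failing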
while [ $okay -lt 1 ]; do
    echo "==> DAS config dataset=$parent"
    if dasgoclient -query "config dataset=$parent" -json > ./inputs/das-json-config-store/"${dataset_result_file}.json"; then
        okay=1
    fi
done
done
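# Illustrative sketch (hypothetical dataset name): a MINIAOD dataset such as
#   /BTagCSV/Run2016B-21Feb2020_ver2_UL2016_HIPM-v1/MINIAOD
# resolves via DAS to its RECO-step parent (the matching .../AOD dataset),
# whose config metadata ends up in
#   ./inputs/das-json-config-store/@BTagCSV@Run2016B-...@AOD.json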

# extract configuration file URLs
rm -f temp_urls
for file in inputs/das-json-config-store/*.json; do
@@ -26,3 +41,9 @@ cat temp_urls | sort -u > urls
# download configuration files
mkdir -p ./inputs/config-store
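# assumption: the sixth '/'-separated field of each URL is the config ID in
# the CMS config cache; it names the local file, and the grid user key/cert
# pair authenticates the download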
cat urls | awk -F/ '{print "curl -o ./inputs/config-store/"$6".configFile -k --key ~/.globus/userkey.nodes.pem --cert ~/.globus/usercert.pem " $0}' | bash

# remove config files with process HARVESTING
configs_harvesting=$(grep -l HARVESTING inputs/config-store/*)
for c in $configs_harvesting; do
rm "$c"
done
8 changes: 8 additions & 0 deletions cms-2016-collision-datasets/code/create-das-json-store.sh
@@ -5,3 +5,11 @@ while IFS= read -r dataset; do
dataset_result_file=$(echo $dataset | tr '/' '@')
dasgoclient -query "dataset=$dataset" -json > ./inputs/das-json-store/"${dataset_result_file}.json"
done < ./inputs/cms-2016-collision-datasets.txt

# minis=$(cat inputs/cms-2016-collision-datasets.txt | grep '/MINI')
# for dataset in $minis; do
# parent=$(dasgoclient -query "parent dataset=$dataset")
# dataset_result_file=$(echo $parent | tr '/' '@')
# dasgoclient -query "dataset=$parent" -json > ./inputs/das-json-store/"${dataset_result_file}.json"
# done

143 changes: 108 additions & 35 deletions cms-2016-collision-datasets/code/create_cms_2016_collision_datasets.py
@@ -21,7 +21,7 @@
get_dataset_location,
)

FWYZARD = {}
DATASET_TRIGGER_LIST = {}

SELECTION_DESCRIPTIONS = {}

@@ -129,17 +129,17 @@ def get_file_checksum(afile):
return hex(zlib.adler32(open(afile, "rb").read(), 1) & 0xFFFFFFFF)[2:]


def populate_fwyzard():
"""Populate FWYZARD dictionary (dataset -> trigger list)."""
def populate_dataset_trigger_list():
"""Populate DATASET_TRIGGER_LIST dictionary (dataset -> trigger list)."""
for line in open("./inputs/hlt-2016-dataset.txt", "r").readlines():
line = line.strip()
dataset, trigger = line.split(",")
if trigger.endswith("_v"):
trigger = trigger[:-2]
if dataset in FWYZARD.keys():
FWYZARD[dataset].append(trigger)
if dataset in DATASET_TRIGGER_LIST.keys():
DATASET_TRIGGER_LIST[dataset].append(trigger)
else:
FWYZARD[dataset] = [
DATASET_TRIGGER_LIST[dataset] = [
trigger,
]
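# A minimal equivalent sketch of the dict build above, assuming the same
# "dataset,trigger" CSV layout of hlt-2016-dataset.txt:
#   DATASET_TRIGGER_LIST.setdefault(dataset, []).append(trigger)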

@@ -170,7 +170,13 @@ def populate_selection_descriptions():

def get_release_for_processing(dataset_full_name):
"""Return CMSSW release info for the given dataset for the processing step."""
return "CMSSW_10_6_26"
p = subprocess.run(
["dasgoclient", "-query", f"release dataset={dataset_full_name}"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
release = p.stdout.decode().strip()
return release
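# Illustrative usage (dataset name hypothetical): for these 2016 re-reco
# datasets DAS reports e.g. "CMSSW_10_6_26", the value previously hardcoded:
#   get_release_for_processing("/BTagCSV/Run2016B-...-v1/MINIAOD")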


def get_release_for_system_details(dataset_full_name):
@@ -187,6 +193,7 @@ def get_global_tag_for_processing(dataset_full_name):
m = re.search(pattern, content)
if m:
processing_global_tag = m.group(1)
processing_global_tag = processing_global_tag.strip()
return processing_global_tag


@@ -236,12 +243,27 @@ def get_dataset_config_file_name(dataset_full_name):
run_period = dataset_full_name.split("/")[2].split("-", 1)[0]
version = dataset_full_name.split("/")[2].split("-")[1]
config_file = f"ReReco-{run_period}-{dataset}-{version}"
if "/AOD" in dataset_full_name:
config_file = f"recoskim_{run_period}_{dataset}"
if "DoubleMuonLowMass" in dataset_full_name:
config_file = f"ReReco-{run_period}-{dataset}-{version}"
return config_file
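# Illustrative mapping (hypothetical name; `dataset` holds the primary-dataset
# segment): "/BTagCSV/Run2016B-21Feb2020_ver2-v1/AOD" yields the config file
# name "recoskim_Run2016B_BTagCSV".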


def get_parent_dataset(dataset_full_name):
"""Return the parent dataset for the given dataset."""
p = subprocess.run(
["dasgoclient", "-query", f"parent dataset={dataset_full_name}"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
parent = p.stdout.decode().strip()
return parent
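# For CMS data tiers, the DAS parent of a .../NANOAOD dataset is the matching
# .../MINIAOD dataset, whose parent in turn is the .../AOD (RECO-step) dataset.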


def create_selection_information(dataset, dataset_full_name):
"""Create box with selection information."""
if "MINIAOD" in dataset_full_name:
if "/MINIAOD" in dataset_full_name:
aodformat = "MINIAOD"
else:
aodformat = "NANOAOD"
@@ -253,31 +275,49 @@ def create_selection_information(dataset, dataset_full_name):
# data taking / HLT:
out += "<p><strong>Data taking / HLT</strong>"
out += '<br/>The collision data were assigned to different RAW datasets using the following <a href="/record/30300">HLT configuration</a>.</p>'
# data processing / RECO:
run_period = re.search(r"(Run[0-9]+.)", dataset_full_name).groups()[0]
afile = get_dataset_config_file_name(dataset_full_name)
# data processing / NANO/PAT/RECO:
aodformat = dataset_full_name.split("/")[3]
process = "PAT"
processing_source = "RAW"
step_dataset = dataset_full_name
steps = []
if aodformat == "NANOAOD":
process = "NANO"
processing_source = "MINIAOD"
generator_text = "Configuration file for " + process + " step " + afile
release = get_release_for_processing(dataset_full_name)
global_tag = get_global_tag_for_processing(dataset_full_name)
out += f"<p><strong>Data processing / {process}</strong>"
steps = [
{"process": "NANO"},
{"process": "PAT"},
{"process": "RECO"}
]
else:
steps = [
{"process": "PAT"},
{"process": "RECO"}
]

out += "<p><strong>Data processing </strong>"
out += (
"<br/>This primary %s dataset was processed from the %s dataset by the following step: "
% (aodformat, processing_source)
"<br/>This %s dataset was processed from the RAW dataset by the following steps: "
% (aodformat)
)
out += "<br/>Step: %s" % process
out += "<br/>Release: %s" % release
out += "<br/>Global tag: %s" % global_tag
out += '\n <br/><a href="/record/%s">%s</a>' % (
LINK_INFO.get(afile, ""),
generator_text,
)
out += "\n </p>"
out += "<br/>"

afile = get_dataset_config_file_name(dataset_full_name)
step_dataset = dataset_full_name
for step in steps:
    process = step["process"]
    generator_text = "Configuration file for " + process + " step " + afile
    release = get_release_for_processing(step_dataset)
    global_tag = get_global_tag_for_processing(step_dataset)

    out += "<br/><strong>Step %s </strong>" % process
    out += "<br/>Release: %s" % release
    out += "<br/>Global tag: %s" % global_tag
    out += '\n <br/><a href="/record/%s">%s</a>' % (
        LINK_INFO.get(afile, ""),
        generator_text,
    )
    out += "<br/>Output dataset: %s" % step_dataset
    out += "\n </p>"
    if process != "RECO":
        # walk one level up the provenance chain for the next step
        step_dataset = get_parent_dataset(step_dataset)
        afile = get_dataset_config_file_name(step_dataset)
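# Resulting provenance walk for a NANOAOD record:
#   step NANO -> the dataset itself (.../NANOAOD)
#   step PAT  -> its DAS parent     (.../MINIAOD)
#   step RECO -> the grandparent    (.../AOD)
# with release and global tag looked up per step dataset.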

# HLT trigger paths:
out += "<p><strong>HLT trigger paths</strong>"
out += '<br/>The possible <a href="/docs/cms-guide-trigger-system#hlt-trigger-path-definitions">HLT trigger paths</a> in this dataset are:'
Expand All @@ -293,7 +333,7 @@ def create_selection_information(dataset, dataset_full_name):

def get_trigger_paths_for_dataset(dataset):
"""Return list of trigger paths for given dataset."""
return FWYZARD.get(dataset, [])
return DATASET_TRIGGER_LIST.get(dataset, [])


def get_dataset_index_files(dataset_full_name):
@@ -319,6 +359,29 @@ def get_dataset_index_files(dataset_full_name):
return files


def get_dataset_semantics_doc(dataset_name, sample_file_path, recid):
"""Produce the dataset semantics files and return their data-curation paths for the given dataset."""
output_dir = f"outputs/docs/NanoAOD/{recid}"
eos_dir = f"/eos/opendata/cms/dataset-semantics/NanoAOD/{recid}"
os.makedirs(output_dir, exist_ok=True)

script = "inspectNanoFile.py"

html_doc_path = f"{output_dir}/{dataset_name}_doc.html"
cmd = f"python3 external-scripts/{script} --doc {html_doc_path} {sample_file_path}"
output = subprocess.getoutput(cmd)
html_eos_path = f"{eos_dir}/{dataset_name}_doc.html"

json_doc_path = f"{output_dir}/{dataset_name}_doc.json"
cmd = f"python3 external-scripts/{script} --json {json_doc_path} {sample_file_path}"
output = subprocess.getoutput(cmd)
json_eos_path = f"{eos_dir}/{dataset_name}_doc.json"

return {"url": html_eos_path, "json": json_eos_path}


def get_doi(dataset_full_name):
"Return DOI for the given dataset."
return DOI_INFO.get(dataset_full_name, "")
@@ -343,6 +406,7 @@ def create_record(recid, run_period, version, dataset, aodformat):
% aodformat
+ "</p><p>The list of validated runs, which must be applied to all analyses, either with the full validation or for an analysis requiring only muons, can be found in:</p>"
)

rec["abstract"]["links"] = [
{"description": "Validated runs, full validation", "recid": "14220"},
{"description": "Validated runs, muons only", "recid": "14221"},
@@ -362,6 +426,13 @@ def create_record(recid, run_period, version, dataset, aodformat):
rec["collision_information"]["energy"] = COLLISION_ENERGY
rec["collision_information"]["type"] = COLLISION_TYPE

if aodformat == "NANOAOD":
dataset_path = f"/eos/opendata/cms/{run_period}/{dataset}/NANOAOD/{version}"
intermediate_dir = os.listdir(dataset_path)
sample_file_path = f"{dataset_path}/{intermediate_dir[0]}"
sample_file_with_path = f"{sample_file_path}/{os.listdir(sample_file_path)[0]}"
rec["dataset_semantics_files"] = get_dataset_semantics_doc(dataset, sample_file_with_path, recid)

rec["date_created"] = [
YEAR_CREATED,
]
Expand All @@ -377,7 +448,9 @@ def create_record(recid, run_period, version, dataset, aodformat):

rec["doi"] = get_doi(dataset_full_name)

rec["experiment"] = "CMS"
rec["experiment"] = [
"CMS"
]

rec["files"] = []

@@ -472,15 +545,15 @@ def create_record(recid, run_period, version, dataset, aodformat):
rec["usage"]["links"] = [
{
"description": "Running CMS analysis code using Docker",
"url": "/docs/cms-guide-docker",
"url": "/docs/cms-guide-docker#nanoaod" if aodformat == "NANOAOD" else "/docs/cms-guide-docker#images",
},
{
"description": "How to install the CMS Virtual Machine",
"url": "/docs/cms-virtual-machine-2016-2018",
"url": "/docs/cms-virtual-machine-cc7",
},
{
"description": "Getting started with CMS open data",
"url": "/docs/cms-getting-started-2016-2018",
"url": "/docs/cms-getting-started-miniaod",
},
]

@@ -513,7 +586,7 @@ def print_records(records):
@click.command()
def main():
"Do the job."
populate_fwyzard()
populate_dataset_trigger_list()
populate_doiinfo()
populate_containerimages_cache()
populate_selection_descriptions()
@@ -23,6 +23,8 @@


RECID_START = 30400
RECID_MAX = 30500 # when this record ID number is reached, continue from the "next" number
RECID_NEXT = 30566 # next free record ID number
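# i.e. record IDs run 30400-30499 and then continue at 30566; the range
# 30500-30565 is reserved for collision-data records (see the jump below).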
YEAR_CREATED = "2016"
YEAR_PUBLISHED = "2024"
COLLISION_ENERGY = "13Tev"
@@ -126,12 +128,12 @@ def main():
# Skip non-RECO files
afile_python_filename = get_python_filename(afile)

if not afile_python_filename.startswith("ReReco"):
if not afile_python_filename.startswith("ReReco") and not afile_python_filename.startswith("recoskim"):
continue

if afile_python_filename in files_seen:
continue

files_seen.append(afile_python_filename)

# Create nice reco_*.py files for copying them over to EOSPUBLIC
@@ -170,7 +172,9 @@ def main():
rec["distribution"]["number_files"] = 1
rec["distribution"]["size"] = get_size(afile)

rec["experiment"] = "CMS"
rec["experiment"] = [
"CMS"
]

rec["files"] = [
{
@@ -207,6 +211,10 @@ def main():
)
recid += 1

# jump over a record ID range that was already preselected for collision data
if recid == RECID_MAX:
recid = RECID_NEXT

fdesc.write("}\n")
fdesc.close()
