Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace system calls by xrootdpyfs #236

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 28 additions & 26 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,46 +1,44 @@
# Environments
.env
.venv
env/
.venv
venv/

*.pyc
*.err
*.pyc
cms-2010-collision-datasets/outputs/*.json
cms-2010-simulated-datasets/outputs/*.json
cms-2011-collision-datasets/code/das.py
cms-2011-collision-datasets/inputs/das-json-store
cms-2011-collision-datasets/outputs/*.xml
cms-2011-collision-datasets-runb-update/inputs/config-store
cms-2011-collision-datasets-runb-update/inputs/das-json-config-store
cms-2011-collision-datasets-runb-update/inputs/das-json-store
cms-2011-collision-datasets-runb-update/outputs/*.json
cms-2011-collision-datasets/code/das.py
cms-2011-collision-datasets/inputs/das-json-store
cms-2011-collision-datasets/outputs/*.xml
cms-2011-hlt-triggers/outputs/*.html
cms-2011-hlt-triggers/outputs/*.xml
cms-2011-l1-triggers/outputs/*.xml
cms-2011-simulated-datasets/inputs/das-json-store
cms-2011-simulated-datasets/outputs/*.xml
cms-2012-collision-datasets/inputs/das-json-store
cms-2012-collision-datasets/outputs/*.json
cms-2012-collision-datasets-update/inputs/config-store
cms-2012-collision-datasets-update/inputs/das-json-config-store
cms-2012-collision-datasets-update/inputs/das-json-store
cms-2012-collision-datasets-update/outputs/*.json
cms-2012-collision-datasets/inputs/das-json-store
cms-2012-collision-datasets/outputs/*.json
cms-2012-event-display-files/inputs/ig/
cms-2012-event-display-files/outputs/*.json
cms-2012-simulated-datasets/inputs/config-store
cms-2012-simulated-datasets/inputs/das-json-store
cms-2012-simulated-datasets/outputs/*.json
cms-2012-simulated-datasets/outputs/create-config-store.sh
cms-2012-simulated-datasets/outputs/create-das-json-store.sh
cms-2013-collision-datasets-hi-ppref/inputs/config-store
cms-2013-collision-datasets-hi-ppref/inputs/das-json-config-store
cms-2013-collision-datasets-hi-ppref/inputs/das-json-store
cms-2013-collision-datasets-hi-ppref/outputs/*.json
cms-2012-simulated-datasets/outputs/*.json
cms-2013-collision-datasets-hi/inputs/config-store
cms-2013-collision-datasets-hi/inputs/das-json-config-store
cms-2013-collision-datasets-hi/inputs/das-json-store
cms-2013-collision-datasets-hi/outputs/*.json
cms-2013-collision-datasets-hi-ppref/inputs/config-store
cms-2013-collision-datasets-hi-ppref/inputs/das-json-config-store
cms-2013-collision-datasets-hi-ppref/inputs/das-json-store
cms-2013-collision-datasets-hi-ppref/outputs/*.json
cms-2013-hlt-triggers/outputs
cms-2013-simulated-datasets-hi/inputs/config-store
cms-2013-simulated-datasets-hi/inputs/das-json-store
Expand All @@ -58,24 +56,28 @@ cms-2015-simulated-datasets/inputs/das-json-store
cms-2015-simulated-datasets/inputs/mcm-store
cms-2015-simulated-datasets/lhe_generators
cms-2015-simulated-datasets/outputs/
cms-2016-collision-datasets/inputs/config-store
cms-2016-collision-datasets/inputs/das-json-config-store
cms-2016-collision-datasets/inputs/das-json-store
cms-2016-collision-datasets/inputs/hlt-config-store
cms-2016-collision-datasets/outputs/*.json
cms-2016-pileup-dataset/cookies.txt
cms-2016-pileup-dataset/inputs/config-store
cms-2016-pileup-dataset/inputs/das-json-store
cms-2016-pileup-dataset/inputs/mcm-store
cms-2016-pileup-dataset/outputs/
cms-2016-simulated-datasets/cookies.txt
cms-2016-simulated-datasets/inputs/config-store
cms-2016-simulated-datasets/inputs/das-json-store
cms-2016-simulated-datasets/inputs/mcm-store
cms-2016-simulated-datasets/lhe_generators
cms-2016-simulated-datasets/outputs/
cms-YYYY-simulated-datasets/cache
cms-YYYY-simulated-datasets/outputs/*.csv
cms-YYYY-simulated-datasets/outputs/*.err
cms-YYYY-simulated-datasets/outputs/*.json
cod2-to-cod3/outputs/*.json
cod2-to-cod3/outputs/*.json
jade-2023-raw-datasets/outputs/*.json
opera-2017-multiplicity-studies/outputs/opera-events.json
opera-2017-multiplicity-studies/outputs/opera-events.json
opera-2019-electron-neutrinos/outputs/opera-events.json
opera-2019-neutrino-induced-charm/outputs/opera-events.json
cms-2016-collision-datasets/inputs/config-store
cms-2016-collision-datasets/inputs/hlt-config-store
cms-2016-collision-datasets/inputs/das-json-store
cms-2016-collision-datasets/inputs/das-json-config-store
cms-2016-collision-datasets/outputs/*.json
cms-2016-pileup-dataset/inputs/config-store
cms-2016-pileup-dataset/inputs/das-json-store
cms-2016-pileup-dataset/inputs/mcm-store
cms-2016-pileup-dataset/outputs/
cms-2016-pileup-dataset/cookies.txt
90 changes: 90 additions & 0 deletions cms-2016-simulated-datasets/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# cms-2016-simulated-datasets

This directory contains helper scripts used to prepare CMS 2016 open data
release regarding MC simulated datasets.

- `code/` folder contains the python code;
- `inputs/` folder contains input text files with the list of datasets for each
year and input files;
- `outputs/` folder contains generated JSON records to be included as the CERN
Open Data portal fixtures.

Every step necessary to produce the final `*.json` files is handled by the
`code/interface.py` script. Details about it can be queried with the command:

```console
$ python3 code/interface.py --help
```

Please make sure to get the VOMS proxy file before running these scripts:

```console
$ voms-proxy-init --voms cms --rfc --valid 190:00
```

Please make sure to set the EOS instance to EOSPUBLIC before running these scripts:

```console
$ export EOS_MGM_URL=root://eospublic.cern.ch
```
Please make sure to have a valid `userkey.nodes.pem` certificate present in
`$HOME/.globus`. If not, you have to run the following on top of the regular
CMS certificate documentation:

```console
$ cd $HOME/.globus
$ ls userkey.nodes.pem
$ openssl pkcs12 -in myCert.p12 -nocerts -nodes -out userkey.nodes.pem # if not present
$ cd -
```

Warning: Creating the full local cache might take a long time.

First step is to create EOS file index cache:

```console
$ python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
```

This requires the data files to be placed in their final location. However, for
early testing on LXPLUS, all steps can be run without the EOS file index cache
by means of adding the command-line option `--ignore-eos-store` to the commands below.

We can now build sample records by doing:

```console
$ python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt

$ auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt
$ python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt

$ python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt

$ python3 code/lhe_generators.py

$ python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
$ python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
```

Note that to build the test records an (empty) input file for DOIs and a recid
info file must be present in the inputs directory.

Each step builds a subdirectory with a cache (`das-json-store`, `mcm-store` and
`config-store`). They are large, do not upload them to the repository, respect
the `.gitignore`.

The output JSON files for the dataset records will be generated in the
`outputs` directory.

## lhe_generators


```console
$ python3 code/lhe_generators.py >& output
```

- This will get lhe generator parameters from gridpacks for datasets listed in `./inputs/CMS-2016-mc-datasets.txt`.
- It works on LXPLUS or with mounted EOS.
- Number of threads is set to 20 which is ideal for LXPLUS.

> :warning: There are many cases with various steps to get generator parameters for LHE — see [#97](https://github.com/cernopendata/data-curation/issues/97). Thus, in some cases, the script MIGHT not work as expected, so make sure to read it, check for errors, and make any necessary tweaks.
81 changes: 81 additions & 0 deletions cms-2016-simulated-datasets/code/conffiles_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python


"""
Create MC 2016 records.
"""

import hashlib
import json
import re
import os
import subprocess
import sys
from urllib.request import urlopen

from utils import get_from_deep_json, \
populate_doiinfo, \
get_dataset_format, \
get_dataset_year, \
get_author_list_recid, \
get_doi
from das_json_store import get_das_store_json, \
get_parent_dataset
from eos_store import XROOTD_URI_BASE, \
get_dataset_index_file_base, \
get_dataset_location
from mcm_store import get_mcm_dict, \
get_global_tag, \
get_genfragment_url, \
get_generator_name, \
get_dataset_energy, \
get_cmsDriver_script
from config_store import get_conffile_ids_all_chain_steps
from categorisation import guess_title_category
from dataset_records import get_dataset, \
newer_dataset_version_exists


def create_record(conf_id, conffiles_dir):
    """Return a record for the given configuration-file id.

    Reads ``<conffiles_dir>/<conf_id>.configFile`` and returns a dict
    holding the ConfDB id and the full text of the configuration script.

    :param conf_id: configuration-file identifier (file basename stem)
    :param conffiles_dir: directory containing the ``*.configFile`` files
    :return: dict with keys ``cms_confdb_id`` and ``script``
    :raises FileNotFoundError: if the config file is not present
    """
    rec = {}

    # os.path.join instead of manual '/' concatenation.
    with open(os.path.join(conffiles_dir, conf_id + '.configFile')) as myfile:
        rec['cms_confdb_id'] = conf_id
        rec['script'] = myfile.read()

    return rec


def create_records(conf_ids, conffiles_dir):
    """Build one record per configuration-file id via create_record()."""
    return [create_record(cid, conffiles_dir) for cid in conf_ids]


def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir):
    """Build config-file records for the datasets and dump them as JSON to stdout."""

    # Keep only the latest version of each dataset; warn about the rest.
    kept = []
    for name in datasets:
        if newer_dataset_version_exists(name, datasets):
            print('[ERROR] Ignoring older dataset version ' + name,
                  file=sys.stderr)
            continue
        kept.append(name)

    # Collect unique config ids across all processing-chain steps,
    # preserving first-seen order.
    seen = set()
    conffiles = []
    for name in kept:
        for config_id in get_conffile_ids_all_chain_steps(name, mcm_dir) or []:
            if config_id not in seen:
                seen.add(config_id)
                conffiles.append(config_id)

    json.dump(create_records(conffiles, conffiles_dir),
              indent=2, sort_keys=True, ensure_ascii=True, fp=sys.stdout)
76 changes: 76 additions & 0 deletions cms-2016-simulated-datasets/code/config_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os
import subprocess
import sys

from eos_store import check_datasets_in_eos_dir
from mcm_store import get_conffile_ids_from_mcm
from utils import get_from_deep_json


def get_conffile_ids_all_chain_steps(dataset, mcm_dir):
    """Return configuration-file ids gathered from every step in the
    dataset's McM processing chain, de-duplicated, in first-seen order.

    Returns an empty list when the chain directory does not exist.
    """
    chain_path = os.path.join(mcm_dir, 'chain', dataset.replace('/', '@'))
    try:
        step_names = os.listdir(chain_path)
    except FileNotFoundError:
        return []

    # Dict keys act as an insertion-ordered set of ids.
    seen = {}
    for step_name in step_names:
        for conf_id in get_conffile_ids_from_mcm(dataset, chain_path + '/' + step_name) or []:
            seen[conf_id] = 1

    return list(seen.keys())


def main(eos_dir,
         mcm_dir,
         conf_dir,
         datasets,
         ignore_eos_store):
    """Download CMSSW configuration files for the given datasets.

    For every dataset (optionally restricted to those present in the EOS
    store), collect the configuration-file ids of all processing-chain
    steps and fetch each missing config file from the cmsweb CouchDB
    request-manager cache into ``conf_dir``.

    :param eos_dir: directory with the EOS file-index cache
    :param mcm_dir: directory with the McM cache
    :param conf_dir: output directory for the ``*.configFile`` files
    :param datasets: iterable of dataset full names
    :param ignore_eos_store: when true, skip the EOS presence check
    """
    # Only keep datasets with EOS file information, unless told otherwise.
    if ignore_eos_store:
        eos_datasets = list(datasets)
    else:
        eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir)

    conffile_ids = []
    for dataset_full_name in eos_datasets:
        # NOTE(review): MINIAODSIM datasets are skipped here — presumably
        # their configs are covered via the parent dataset's chain; confirm.
        if not dataset_full_name.endswith('MINIAODSIM'):
            for conffile_id in get_conffile_ids_all_chain_steps(dataset_full_name, mcm_dir):
                if conffile_id not in conffile_ids:
                    conffile_ids.append(conffile_id)

    # exist_ok=True makes the pre-check redundant.
    os.makedirs(conf_dir, exist_ok=True)

    # Grid credentials used to authenticate against cmsweb.cern.ch.
    # Expand '~' ourselves since curl is no longer run through a shell.
    key_nodes = os.path.expanduser("~/.globus/userkey.nodes.pem")
    cert = os.path.expanduser("~/.globus/usercert.pem")

    total = len(conffile_ids)
    for i, conffile_id in enumerate(conffile_ids, start=1):
        filepath = "{}/{}.configFile".format(conf_dir, conffile_id)
        if os.path.exists(filepath) and os.stat(filepath).st_size != 0:
            print("==> " + conffile_id + ".configFile\n==> Already exists. Skipping...")
            continue

        print("Getting ({}/{}) {}/{}.configFile".format(i, total, conf_dir, conffile_id))

        # Argument list with shell=False so conffile_id is never
        # interpreted by a shell.
        cmd = [
            "curl", "-s", "-k",
            "--key", key_nodes,
            "--cert", cert,
            "https://cmsweb.cern.ch/couchdb/reqmgr_config_cache/{}/configFile".format(conffile_id),
        ]
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        confs = result.stdout.decode("utf-8")
        if confs:
            with open(filepath, 'w') as outfile:
                outfile.write(confs)
        else:
            print("[ERROR] Empty conf file for {ds}".format(ds=conffile_id), file=sys.stderr)
Loading
Loading