diff --git a/datasets/0.8/audio_test/metadata.json b/datasets/0.8/audio_test/metadata.json index 4ffb48c3d..ee2c7e999 100644 --- a/datasets/0.8/audio_test/metadata.json +++ b/datasets/0.8/audio_test/metadata.json @@ -40,6 +40,7 @@ "@type": "sc:Dataset", "name": "audio_test", "description": "This is the basic test case for audio files", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "distribution": [ { diff --git a/datasets/0.8/bigcode-the-stack/metadata.json b/datasets/0.8/bigcode-the-stack/metadata.json index 23a62bac9..c3a004f52 100644 --- a/datasets/0.8/bigcode-the-stack/metadata.json +++ b/datasets/0.8/bigcode-the-stack/metadata.json @@ -41,6 +41,7 @@ "description": "The Stack contains over 6TB of permissively-licensed source code files covering 358 programming languages. The dataset was created as part of the BigCode Project, an open scientific collaboration working on the responsible development of Large Language Models for Code (Code LLMs). The Stack serves as a pre-training dataset for Code LLMs, i.e., code-generating AI systems which enable the synthesis of programs from natural language descriptions as well as other from code snippets.", "citation": "@article{Kocetkov2022TheStack, title={The Stack: 3 TB of permissively licensed source code}, author={Kocetkov, Denis and Li, Raymond and Ben Allal, Loubna and Li, Jia and Mou,Chenghao and Mu\u00f1oz Ferrandis, Carlos and Jernite, Yacine and Mitchell, Margaret and Hughes, Sean and Wolf, Thomas and Bahdanau, Dzmitry and von Werra, Leandro and de Vries, Harm}, journal={Preprint}, year={2022} }", "license": "other", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/bigcode/the-stack", "distribution": [ { diff --git a/datasets/0.8/coco2014-mini/metadata.json b/datasets/0.8/coco2014-mini/metadata.json index 49aa53f57..dc5847aea 100644 --- a/datasets/0.8/coco2014-mini/metadata.json +++ b/datasets/0.8/coco2014-mini/metadata.json @@ -42,8 +42,9 @@ "description": "Smaller downloadable version of COCO to be used in unit tests.", "citation": "None", "license": "cc-by-4.0", - "version": "1.0.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", + "version": "1.0.0", "distribution": [ { "@type": "sc:FileObject", diff --git a/datasets/0.8/coco2014/metadata.json b/datasets/0.8/coco2014/metadata.json index 5aa1f6ebe..2007832a7 100644 --- a/datasets/0.8/coco2014/metadata.json +++ b/datasets/0.8/coco2014/metadata.json @@ -42,6 +42,7 @@ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.", "citation": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}", "license": "cc-by-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://cocodataset.org/", "distribution": [ { diff --git a/datasets/0.8/fashion-mnist/metadata.json b/datasets/0.8/fashion-mnist/metadata.json index 0d4f70e5a..e79d99bad 100644 --- a/datasets/0.8/fashion-mnist/metadata.json +++ b/datasets/0.8/fashion-mnist/metadata.json @@ -41,6 +41,7 @@ "description": "Fashion-MNIST is a dataset of Zalando's article images\u2014consisting of a training set of\n60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image,\nassociated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in\nreplacement for the original MNIST dataset for benchmarking machine learning algorithms.\nIt shares the same image size and structure of training and testing splits.\n", "citation": "@article{DBLP:journals/corr/abs-1708-07747,\n author = {Han Xiao and\n Kashif Rasul and\n Roland Vollgraf},\n title = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning\n Algorithms},\n journal = {CoRR},\n volume = {abs/1708.07747},\n year = {2017},\n url = {http://arxiv.org/abs/1708.07747},\n archivePrefix = {arXiv},\n eprint = {1708.07747},\n timestamp = {Mon, 13 Aug 2018 16:47:27 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/abs-1708-07747},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "license": "mit", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/fashion_mnist", "distribution": [ { @@ -96,4 +97,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/datasets/0.8/flores-200/metadata.json b/datasets/0.8/flores-200/metadata.json index 7dfd11567..bbd74a8d6 100644 --- a/datasets/0.8/flores-200/metadata.json +++ b/datasets/0.8/flores-200/metadata.json @@ -45,6 +45,7 @@ "@inproceedings{twoeval, title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, author={Guzm\\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, journal={arXiv preprint arXiv:1902.01382}, year={2019}}" ], "license": "cc-by-sa-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/facebookresearch/flores", "version": "0.0.1", "distribution": [ diff --git a/datasets/0.8/gpt-3/metadata.json b/datasets/0.8/gpt-3/metadata.json index cced1571f..6656dd570 100644 --- a/datasets/0.8/gpt-3/metadata.json +++ b/datasets/0.8/gpt-3/metadata.json @@ -41,6 +41,7 @@ "description": "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions \u2013 something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.", "citation": "@article{brown2020language, title={Language Models are Few-Shot Learners}, author={Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei}, year={2020}, eprint={2005.14165}, archivePrefix={arXiv}, primaryClass={cs.CL} }", "license": "unknown", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/openai/gpt-3", "distribution": [ { diff --git a/datasets/0.8/huggingface-c4/metadata.json b/datasets/0.8/huggingface-c4/metadata.json index fc7246b47..f3f3f2c41 100644 --- a/datasets/0.8/huggingface-c4/metadata.json +++ b/datasets/0.8/huggingface-c4/metadata.json @@ -41,6 +41,7 @@ "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "citation": "\n@article{2019t5,\n author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},\n title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},\n journal = {arXiv e-prints},\n year = {2019},\n archivePrefix = {arXiv},\n eprint = {1910.10683},\n}\n", "license": "odc-by", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/c4", "version": "0.0.0", "distribution": [ diff --git a/datasets/0.8/huggingface-mnist/metadata.json b/datasets/0.8/huggingface-mnist/metadata.json index 95a02d5d9..5b491960b 100644 --- a/datasets/0.8/huggingface-mnist/metadata.json +++ b/datasets/0.8/huggingface-mnist/metadata.json @@ -41,6 +41,7 @@ "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.\n", "citation": "@article{lecun2010mnist,\n title={MNIST handwritten digit database},\n author={LeCun, Yann and Cortes, Corinna and Burges, CJ},\n journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},\n volume={2},\n year={2010}\n}\n", "license": "mit", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/mnist", "version": "1.0.0", "distribution": [ diff --git a/datasets/0.8/movielens/metadata.json b/datasets/0.8/movielens/metadata.json index e36d509b4..a70b01dc3 100644 --- a/datasets/0.8/movielens/metadata.json +++ b/datasets/0.8/movielens/metadata.json @@ -39,6 +39,7 @@ "@type": "sc:Dataset", "name": "Movielens-25M", "description": "MovieLens 25M movie ratings. Stable benchmark dataset. 25 million ratings and one million tag applications applied to 62,000 movies by 162,000 users. Includes tag genome data with 15 million relevance scores across 1,129 tags. Released 12/2019", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://grouplens.org/datasets/movielens/25m/", "distribution": [ { diff --git a/datasets/0.8/pass-mini/metadata.json b/datasets/0.8/pass-mini/metadata.json index 057ced2cd..f66a0f40d 100755 --- a/datasets/0.8/pass-mini/metadata.json +++ b/datasets/0.8/pass-mini/metadata.json @@ -41,6 +41,7 @@ "description": "Smaller downloadable version of PASS to be used in unit tests.", "citation": "None", "license": "None", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "distribution": [ { diff --git a/datasets/0.8/pass/metadata.json b/datasets/0.8/pass/metadata.json index 062a11167..1feb13325 100755 --- a/datasets/0.8/pass/metadata.json +++ b/datasets/0.8/pass/metadata.json @@ -41,6 +41,7 @@ "description": "PASS is a large-scale image dataset that does not include any humans and which can be used for high-quality pretraining while significantly reducing privacy concerns.", "citation": "@Article{asano21pass, author = \"Yuki M. Asano and Christian Rupprecht and Andrew Zisserman and Andrea Vedaldi\", title = \"PASS: An ImageNet replacement for self-supervised pretraining without humans\", journal = \"NeurIPS Track on Datasets and Benchmarks\", year = \"2021\" }", "license": "cc-by-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.robots.ox.ac.uk/~vgg/data/pass/", "distribution": [ { diff --git a/datasets/0.8/recipes/compressed_archive.json b/datasets/0.8/recipes/compressed_archive.json index 47e7b6dec..2015dcbb5 100644 --- a/datasets/0.8/recipes/compressed_archive.json +++ b/datasets/0.8/recipes/compressed_archive.json @@ -39,6 +39,7 @@ "@type": "sc:Dataset", "name": "compressed_archive_example", "description": "This is a fairly minimal example, showing a way to describe archive files.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/datasets/recipes/compressed_archive/about", "distribution": [ { diff --git a/datasets/0.8/recipes/enum.json b/datasets/0.8/recipes/enum.json index e4ac6dfc2..160c6ae5b 100644 --- a/datasets/0.8/recipes/enum.json +++ b/datasets/0.8/recipes/enum.json @@ -39,6 +39,7 @@ "@type": "sc:Dataset", "name": "enum_example", "description": "This is a fairly minimal example, showing a way to describe enumerations.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/datasets/enum/about", "distribution": [ { diff --git a/datasets/0.8/recipes/file_object_in_zip.json b/datasets/0.8/recipes/file_object_in_zip.json index cbac32f94..83b38c91e 100644 --- a/datasets/0.8/recipes/file_object_in_zip.json +++ b/datasets/0.8/recipes/file_object_in_zip.json @@ -39,6 +39,7 @@ "@type": "sc:Dataset", "name": "file_object_in_zip", "description": "Minimal example to read a FileObject contained in a zip.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/recipes/minimal.json b/datasets/0.8/recipes/minimal.json index 988e64ec1..b1993bc41 100644 --- a/datasets/0.8/recipes/minimal.json +++ b/datasets/0.8/recipes/minimal.json @@ -39,5 +39,6 @@ "@type": "sc:Dataset", "name": "minimal_example", "description": "This is a very minimal example, with only the required fields.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/dataset/minimal/about" } diff --git a/datasets/0.8/recipes/minimal_recommended.json b/datasets/0.8/recipes/minimal_recommended.json index 99bcff590..f90e506b6 100644 --- a/datasets/0.8/recipes/minimal_recommended.json +++ b/datasets/0.8/recipes/minimal_recommended.json @@ -40,6 +40,7 @@ "name": "minimal_example_with_recommended_fields", "description": "This is a minimal example, including the required and the recommended fields.", "license": "https://creativecommons.org/licenses/by/4.0/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/dataset/recipes/minimal-recommended", "distribution": [ { diff --git a/datasets/0.8/recipes/read_binary_file_by_line.json b/datasets/0.8/recipes/read_binary_file_by_line.json index dabb7164f..27af1bde3 100644 --- a/datasets/0.8/recipes/read_binary_file_by_line.json +++ b/datasets/0.8/recipes/read_binary_file_by_line.json @@ -39,6 +39,7 @@ "@type": "sc:Dataset", "name": "read_binary_file_by_line", "description": "This is a recipe illustrating how to read files line by line.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/recipes/read_from_directory.json b/datasets/0.8/recipes/read_from_directory.json index 930cabb9a..38b08f30e 100644 --- a/datasets/0.8/recipes/read_from_directory.json +++ b/datasets/0.8/recipes/read_from_directory.json @@ -40,6 +40,7 @@ "@type": "sc:Dataset", "name": "read_from_directory", "description": "Minimal example showing how to read from local directories.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/mlcommons/croissant", "distribution": [ { diff --git a/datasets/0.8/recipes/read_from_tar.json b/datasets/0.8/recipes/read_from_tar.json index 3e81705ce..89f306b96 100644 --- a/datasets/0.8/recipes/read_from_tar.json +++ b/datasets/0.8/recipes/read_from_tar.json @@ -39,6 +39,7 @@ "@type": "sc:Dataset", "name": "read_from_tar", "description": "Example dataset to read several FileSets from a tar.gz and join them.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/recipes/simple-split.json b/datasets/0.8/recipes/simple-split.json index ffc589cff..1e93d84db 100644 --- a/datasets/0.8/recipes/simple-split.json +++ b/datasets/0.8/recipes/simple-split.json @@ -41,6 +41,7 @@ "name": "simple-split", "description": "An artificial example dataset defining splits from a CSV column", "license": "https://creativecommons.org/licenses/by/4.0/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/simple-join/metadata.json b/datasets/0.8/simple-join/metadata.json index 754ca656d..fc44a1e42 100644 --- a/datasets/0.8/simple-join/metadata.json +++ b/datasets/0.8/simple-join/metadata.json @@ -40,6 +40,7 @@ "name": "simple-join", "description": "Example to showcase the use of join.", "license": "https://creativecommons.org/licenses/by/4.0/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/simple-parquet/metadata.json b/datasets/0.8/simple-parquet/metadata.json index 4de3b5050..15ff43f8e 100644 --- a/datasets/0.8/simple-parquet/metadata.json +++ b/datasets/0.8/simple-parquet/metadata.json @@ -40,6 +40,7 @@ "name": "simple-parquet", "description": "Example to read Parquet files.", "license": "https://creativecommons.org/licenses/by/4.0/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/titanic/metadata.json b/datasets/0.8/titanic/metadata.json index 4733fb231..3468faecd 100644 --- a/datasets/0.8/titanic/metadata.json +++ b/datasets/0.8/titanic/metadata.json @@ -42,6 +42,7 @@ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "afl-3.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.openml.org/d/40945", "version": "1.0.0", "distribution": [ diff --git a/datasets/0.8/wiki-text/metadata.json b/datasets/0.8/wiki-text/metadata.json index ff57b65da..aad22e37a 100644 --- a/datasets/0.8/wiki-text/metadata.json +++ b/datasets/0.8/wiki-text/metadata.json @@ -42,6 +42,7 @@ "description": "The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.\n\nCompared to the preprocessed version of Penn Treebank (PTB), WikiText-2 is over 2 times larger and WikiText-103 is over 110 times larger. The WikiText dataset also features a far larger vocabulary and retains the original case, punctuation and numbers - all of which are removed in PTB. As it is composed of full articles, the dataset is well suited for models that can take advantage of long term dependencies.", "citation": "@article{merity2016pointer, title={Pointer sentinel mixture models}, author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard}, journal={arXiv preprint arXiv:1609.07843}, year={2016} }", "license": "cc-by-sa-3.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/", "distribution": [ { diff --git a/datasets/0.8/world-happiness/metadata.json b/datasets/0.8/world-happiness/metadata.json index 3bf4ab1a9..08de0472f 100644 --- a/datasets/0.8/world-happiness/metadata.json +++ b/datasets/0.8/world-happiness/metadata.json @@ -41,6 +41,7 @@ "description": "Happiness scored according to economic production, social support, etc.", "citation": "None", "license": "cc0-1.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.kaggle.com/datasets/unsdsn/world-happiness", "distribution": [ { diff --git a/datasets/1.0/audio_test/metadata.json b/datasets/1.0/audio_test/metadata.json index 8b0a8c5a2..e2cbf9637 100644 --- a/datasets/1.0/audio_test/metadata.json +++ b/datasets/1.0/audio_test/metadata.json @@ -46,8 +46,9 @@ }, "@type": "sc:Dataset", "name": "audio_test", - "description": "This is the basic test case for audio files", "conformsTo": "http://mlcommons.org/croissant/1.0", + "description": "This is the basic test case for audio files", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "distribution": [ { @@ -69,10 +70,10 @@ "description": "These are the sounds.", "dataType": "sc:AudioObject", "source": { + "fileSet": "files", "extract": { "fileProperty": "content" - }, - "fileSet": "files" + } } } ] diff --git a/datasets/1.0/bigcode-the-stack/metadata.json b/datasets/1.0/bigcode-the-stack/metadata.json index 13c9a8744..c690ad550 100644 --- a/datasets/1.0/bigcode-the-stack/metadata.json +++ b/datasets/1.0/bigcode-the-stack/metadata.json @@ -66,6 +66,7 @@ ], "license": "other", "sameAs": "https://www.bigcode-project.org/docs/about/the-stack/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/bigcode/the-stack", "distribution": [ { diff --git a/datasets/1.0/coco2014-mini/metadata.json b/datasets/1.0/coco2014-mini/metadata.json index 2898ad897..47c7a32e6 100644 --- a/datasets/1.0/coco2014-mini/metadata.json +++ b/datasets/1.0/coco2014-mini/metadata.json @@ -50,6 +50,7 @@ "description": "Smaller downloadable version of COCO to be used in unit tests.", "citeAs": "None", "license": "cc-by-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "version": "1.0.0", "distribution": [ diff --git a/datasets/1.0/coco2014/metadata.json b/datasets/1.0/coco2014/metadata.json index 4a5a8f831..479674959 100644 --- a/datasets/1.0/coco2014/metadata.json +++ b/datasets/1.0/coco2014/metadata.json @@ -50,6 +50,7 @@ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.", "citeAs": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}", "license": "cc-by-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://cocodataset.org/", "distribution": [ { diff --git a/datasets/1.0/fashion-mnist/metadata.json b/datasets/1.0/fashion-mnist/metadata.json index ecee238a2..c327ad2b8 100644 --- a/datasets/1.0/fashion-mnist/metadata.json +++ b/datasets/1.0/fashion-mnist/metadata.json @@ -49,6 +49,7 @@ "description": "Fashion-MNIST is a dataset of Zalando's article images\u2014consisting of a training set of\n60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image,\nassociated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in\nreplacement for the original MNIST dataset for benchmarking machine learning algorithms.\nIt shares the same image size and structure of training and testing splits.\n", "citeAs": "@article{DBLP:journals/corr/abs-1708-07747,\n author = {Han Xiao and\n Kashif Rasul and\n Roland Vollgraf},\n title = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning\n Algorithms},\n journal = {CoRR},\n volume = {abs/1708.07747},\n year = {2017},\n url = {http://arxiv.org/abs/1708.07747},\n archivePrefix = {arXiv},\n eprint = {1708.07747},\n timestamp = {Mon, 13 Aug 2018 16:47:27 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/abs-1708-07747},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "license": "mit", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/fashion_mnist", "distribution": [ { diff --git a/datasets/1.0/flores-200/metadata.json b/datasets/1.0/flores-200/metadata.json index 3549f2247..64c6bde5e 100644 --- a/datasets/1.0/flores-200/metadata.json +++ b/datasets/1.0/flores-200/metadata.json @@ -53,6 +53,7 @@ "@inproceedings{twoeval, title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, author={Guzm\\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, journal={arXiv preprint arXiv:1902.01382}, year={2019}}" ], "license": "cc-by-sa-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/facebookresearch/flores", "version": "0.0.1", "distribution": [ diff --git a/datasets/1.0/gpt-3/metadata.json b/datasets/1.0/gpt-3/metadata.json index 898c8584c..df03dc2f1 100644 --- a/datasets/1.0/gpt-3/metadata.json +++ b/datasets/1.0/gpt-3/metadata.json @@ -49,6 +49,7 @@ "description": "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions \u2013 something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.", "citeAs": "@article{brown2020language, title={Language Models are Few-Shot Learners}, author={Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei}, year={2020}, eprint={2005.14165}, archivePrefix={arXiv}, primaryClass={cs.CL} }", "license": "unknown", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/openai/gpt-3", "distribution": [ { diff --git a/datasets/1.0/huggingface-c4/metadata.json b/datasets/1.0/huggingface-c4/metadata.json index 703ea4cf1..e5e7ce420 100644 --- a/datasets/1.0/huggingface-c4/metadata.json +++ b/datasets/1.0/huggingface-c4/metadata.json @@ -49,6 +49,7 @@ "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "citeAs": "\n@article{2019t5,\n author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},\n title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},\n journal = {arXiv e-prints},\n year = {2019},\n archivePrefix = {arXiv},\n eprint = {1910.10683},\n}\n", "license": "odc-by", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/c4", "version": "0.0.0", "distribution": [ diff --git a/datasets/1.0/huggingface-mnist/metadata.json b/datasets/1.0/huggingface-mnist/metadata.json index f02cad51f..ce6a532c6 100644 --- a/datasets/1.0/huggingface-mnist/metadata.json +++ b/datasets/1.0/huggingface-mnist/metadata.json @@ -49,6 +49,7 @@ "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.\n", "citeAs": "@article{lecun2010mnist,\n title={MNIST handwritten digit database},\n author={LeCun, Yann and Cortes, Corinna and Burges, CJ},\n journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},\n volume={2},\n year={2010}\n}\n", "license": "mit", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/mnist", "version": "1.0.0", "distribution": [ diff --git a/datasets/1.0/movielens/metadata.json b/datasets/1.0/movielens/metadata.json index 07e2c665c..b9b9f2284 100644 --- a/datasets/1.0/movielens/metadata.json +++ b/datasets/1.0/movielens/metadata.json @@ -47,6 +47,7 @@ "name": "Movielens-25M", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "MovieLens 25M movie ratings. Stable benchmark dataset. 25 million ratings and one million tag applications applied to 62,000 movies by 162,000 users. Includes tag genome data with 15 million relevance scores across 1,129 tags. Released 12/2019", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://grouplens.org/datasets/movielens/25m/", "distribution": [ { diff --git a/datasets/1.0/pass-mini/metadata.json b/datasets/1.0/pass-mini/metadata.json index e7f08c726..d90bd6642 100755 --- a/datasets/1.0/pass-mini/metadata.json +++ b/datasets/1.0/pass-mini/metadata.json @@ -49,6 +49,7 @@ "description": "Smaller downloadable version of PASS to be used in unit tests.", "citeAs": "None", "license": "None", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "distribution": [ { diff --git a/datasets/1.0/pass/metadata.json b/datasets/1.0/pass/metadata.json index c315656c1..71c78fc4f 100755 --- a/datasets/1.0/pass/metadata.json +++ b/datasets/1.0/pass/metadata.json @@ -49,6 +49,7 @@ "description": "PASS is a large-scale image dataset that does not include any humans and which can be used for high-quality pretraining while significantly reducing privacy concerns.", "citeAs": "@Article{asano21pass, author = \"Yuki M. Asano and Christian Rupprecht and Andrew Zisserman and Andrea Vedaldi\", title = \"PASS: An ImageNet replacement for self-supervised pretraining without humans\", journal = \"NeurIPS Track on Datasets and Benchmarks\", year = \"2021\" }", "license": "cc-by-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.robots.ox.ac.uk/~vgg/data/pass/", "distribution": [ { diff --git a/datasets/1.0/recipes/compressed_archive.json b/datasets/1.0/recipes/compressed_archive.json index c30cb3f88..93ab6d778 100644 --- a/datasets/1.0/recipes/compressed_archive.json +++ b/datasets/1.0/recipes/compressed_archive.json @@ -47,6 +47,7 @@ "name": "compressed_archive_example", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a fairly minimal example, showing a way to describe archive files.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/datasets/recipes/compressed_archive/about", "distribution": [ { diff --git a/datasets/1.0/recipes/enum.json b/datasets/1.0/recipes/enum.json index 98f585364..10034c067 100644 --- a/datasets/1.0/recipes/enum.json +++ b/datasets/1.0/recipes/enum.json @@ -47,6 +47,7 @@ "name": "enum_example", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a fairly minimal example, showing a way to describe enumerations.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/datasets/enum/about", "distribution": [ { diff --git a/datasets/1.0/recipes/file_object_in_zip.json b/datasets/1.0/recipes/file_object_in_zip.json index b9985200a..de51e5b12 100644 --- a/datasets/1.0/recipes/file_object_in_zip.json +++ b/datasets/1.0/recipes/file_object_in_zip.json @@ -47,6 +47,7 @@ "name": "file_object_in_zip", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Minimal example to read a FileObject contained in a zip.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/recipes/minimal.json b/datasets/1.0/recipes/minimal.json index 356e3f075..968378c19 100644 --- a/datasets/1.0/recipes/minimal.json +++ b/datasets/1.0/recipes/minimal.json @@ -47,5 +47,6 @@ "name": "minimal_example", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a very minimal example, with only the required fields.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/dataset/minimal/about" } diff --git a/datasets/1.0/recipes/minimal_recommended.json b/datasets/1.0/recipes/minimal_recommended.json index 6d11ad5d9..3b6eabe7f 100644 --- a/datasets/1.0/recipes/minimal_recommended.json +++ b/datasets/1.0/recipes/minimal_recommended.json @@ -48,6 +48,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a minimal example, including the required and the recommended fields.", "license": "https://creativecommons.org/licenses/by/4.0/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/dataset/recipes/minimal-recommended", "distribution": [ { diff --git a/datasets/1.0/recipes/read_binary_file_by_line.json b/datasets/1.0/recipes/read_binary_file_by_line.json index 1cadf942a..a201624f3 100644 --- a/datasets/1.0/recipes/read_binary_file_by_line.json +++ b/datasets/1.0/recipes/read_binary_file_by_line.json @@ -47,6 +47,7 @@ "name": "read_binary_file_by_line", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a recipe illustrating how to read files line by line.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/recipes/read_from_directory.json b/datasets/1.0/recipes/read_from_directory.json index 448a5e36f..682885adb 100644 --- a/datasets/1.0/recipes/read_from_directory.json +++ b/datasets/1.0/recipes/read_from_directory.json @@ -48,6 +48,7 @@ "name": "read_from_directory", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Minimal example showing how to read from local directories.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/mlcommons/croissant", "distribution": [ { diff --git a/datasets/1.0/recipes/read_from_tar.json b/datasets/1.0/recipes/read_from_tar.json index c40591c6f..233286502 100644 --- a/datasets/1.0/recipes/read_from_tar.json +++ b/datasets/1.0/recipes/read_from_tar.json @@ -47,6 +47,7 @@ "name": "read_from_tar", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Example dataset to read several FileSets from a tar.gz and join them.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/recipes/simple-split.json b/datasets/1.0/recipes/simple-split.json index 62c96d548..8671ff06f 100644 --- a/datasets/1.0/recipes/simple-split.json +++ b/datasets/1.0/recipes/simple-split.json @@ -49,6 +49,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "An artificial example dataset defining splits from a CSV column", "license": "https://creativecommons.org/licenses/by/4.0/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/simple-join/metadata.json b/datasets/1.0/simple-join/metadata.json index 82db4bb6a..a83163f34 100644 --- a/datasets/1.0/simple-join/metadata.json +++ b/datasets/1.0/simple-join/metadata.json @@ -48,6 +48,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Example to showcase the use of join.", "license": "https://creativecommons.org/licenses/by/4.0/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/simple-parquet/metadata.json b/datasets/1.0/simple-parquet/metadata.json index ed33e886a..107203ee1 100644 --- a/datasets/1.0/simple-parquet/metadata.json +++ b/datasets/1.0/simple-parquet/metadata.json @@ -48,6 +48,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Example to read Parquet files.", "license": "https://creativecommons.org/licenses/by/4.0/", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/titanic/metadata.json b/datasets/1.0/titanic/metadata.json index 4e2a62a3c..9b190192b 100644 --- a/datasets/1.0/titanic/metadata.json +++ b/datasets/1.0/titanic/metadata.json @@ -50,6 +50,7 @@ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citeAs": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "afl-3.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.openml.org/d/40945", "version": "1.0.0", "distribution": [ diff --git a/datasets/1.0/wiki-text/metadata.json b/datasets/1.0/wiki-text/metadata.json index c774b6809..75cbaeb38 100644 --- a/datasets/1.0/wiki-text/metadata.json +++ b/datasets/1.0/wiki-text/metadata.json @@ -50,6 +50,7 @@ "description": "The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.\n\nCompared to the preprocessed version of Penn Treebank (PTB), WikiText-2 is over 2 times larger and WikiText-103 is over 110 times larger. The WikiText dataset also features a far larger vocabulary and retains the original case, punctuation and numbers - all of which are removed in PTB. As it is composed of full articles, the dataset is well suited for models that can take advantage of long term dependencies.", "citeAs": "@article{merity2016pointer, title={Pointer sentinel mixture models}, author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard}, journal={arXiv preprint arXiv:1609.07843}, year={2016} }", "license": "cc-by-sa-3.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/", "distribution": [ { diff --git a/datasets/1.0/world-happiness/metadata.json b/datasets/1.0/world-happiness/metadata.json index 8d8a27993..693ba513a 100644 --- a/datasets/1.0/world-happiness/metadata.json +++ b/datasets/1.0/world-happiness/metadata.json @@ -49,6 +49,7 @@ "description": "Happiness scored according to economic production, social support, etc.", "citeAs": "None", "license": "cc0-1.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.kaggle.com/datasets/unsdsn/world-happiness", "distribution": [ { diff --git a/editor/core/state.py b/editor/core/state.py index 634d5f6bd..ae3cce156 100644 --- a/editor/core/state.py +++ b/editor/core/state.py @@ -198,6 +198,7 @@ class Metadata: date_published: datetime.datetime | None = None license: str | None = "" personal_sensitive_information: str | None = None + sd_license: str | None = "" uuid: str | None = None url: str = "" distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list) diff --git a/editor/cypress/fixtures/0.8/coco2014.json b/editor/cypress/fixtures/0.8/coco2014.json index 5aa1f6ebe..2007832a7 100644 --- a/editor/cypress/fixtures/0.8/coco2014.json +++ b/editor/cypress/fixtures/0.8/coco2014.json @@ -42,6 +42,7 @@ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.", "citation": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}", "license": "cc-by-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://cocodataset.org/", "distribution": [ { diff --git a/editor/cypress/fixtures/0.8/titanic.json b/editor/cypress/fixtures/0.8/titanic.json index 4733fb231..3468faecd 100644 --- a/editor/cypress/fixtures/0.8/titanic.json +++ b/editor/cypress/fixtures/0.8/titanic.json @@ -42,6 +42,7 @@ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "afl-3.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.openml.org/d/40945", "version": "1.0.0", "distribution": [ diff --git a/editor/cypress/fixtures/1.0/coco2014.json b/editor/cypress/fixtures/1.0/coco2014.json index 4a5a8f831..479674959 100644 --- a/editor/cypress/fixtures/1.0/coco2014.json +++ b/editor/cypress/fixtures/1.0/coco2014.json @@ -50,6 +50,7 @@ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.", "citeAs": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}", "license": "cc-by-4.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://cocodataset.org/", "distribution": [ { diff --git a/editor/cypress/fixtures/1.0/titanic.json b/editor/cypress/fixtures/1.0/titanic.json index 4e2a62a3c..9b190192b 100644 --- a/editor/cypress/fixtures/1.0/titanic.json +++ b/editor/cypress/fixtures/1.0/titanic.json @@ -50,6 +50,7 @@ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citeAs": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "afl-3.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.openml.org/d/40945", "version": "1.0.0", "distribution": [ diff --git a/editor/views/jsonld.py b/editor/views/jsonld.py index 603885815..8f3f829e1 100644 --- a/editor/views/jsonld.py +++ b/editor/views/jsonld.py @@ -49,6 +49,7 @@ def render_jsonld(): name=croissant.metadata.name, cite_as=croissant.metadata.cite_as, license=croissant.metadata.license, + sd_license=croissant.metadata.sd_license, description=croissant.metadata.description, url=croissant.metadata.url, distribution=distribution, diff --git a/python/mlcroissant/mlcroissant/_src/core/constants.py b/python/mlcroissant/mlcroissant/_src/core/constants.py index 8768b8bab..32f058bfe 100644 --- a/python/mlcroissant/mlcroissant/_src/core/constants.py +++ b/python/mlcroissant/mlcroissant/_src/core/constants.py @@ -96,6 +96,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace: SCHEMA_ORG_NAME = namespace.SDO.name SCHEMA_ORG_PUBLISHER = namespace.SDO.publisher SCHEMA_ORG_SAME_AS = namespace.SDO.sameAs +SCHEMA_ORG_SD_LICENSE = namespace.SDO.sdLicense SCHEMA_ORG_SHA256 = namespace.SDO.sha256 SCHEMA_ORG_URL = namespace.SDO.url SCHEMA_ORG_VERSION = namespace.SDO.version @@ -143,6 +144,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace: SCHEMA_ORG_ENCODING_FORMAT: "encoding_format", SCHEMA_ORG_KEYWORDS: "keywords", SCHEMA_ORG_LICENSE: "license", + SCHEMA_ORG_SD_LICENSE: "sd_license", SCHEMA_ORG_MD5: "md5", SCHEMA_ORG_NAME: "name", SCHEMA_ORG_PUBLISHER: "publisher", diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py index 96234c55c..2b96b6bbb 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py @@ -106,6 +106,7 @@ class Metadata(Node): is_live_dataset: bool | None = None keywords: list[str] | None = None license: list[str] | None = None + sd_license: str | None = None name: str = "" publisher: list[PersonOrOrganization] | None = None same_as: list[str] | None = None @@ -145,12 +146,13 @@ def __post_init__(self): self.validate_name() self.version = self.validate_version() self.license = self.validate_license() + self.sd_license = self.validate_sd_license() self.date_created = self.validate_date(self.date_created) self.date_modified = self.validate_date(self.date_modified) self.date_published = self.validate_date(self.date_published) self.assert_has_mandatory_properties("name") self.assert_has_optional_properties( - "cite_as", "date_published", "license", "version" + "cite_as", "date_published", "license", "sd_license", "version" ) # Raise exception if there are errors. @@ -184,6 +186,7 @@ def to_json(self) -> Json: "license": unbox_singleton_list(self.license), "personalSensitiveInformation": self.personal_sensitive_information, "publisher": PersonOrOrganization.to_json(self.publisher), + "sdLicense": self.sd_license, "url": self.url, "sameAs": unbox_singleton_list(self.same_as), "version": self.version, @@ -265,6 +268,17 @@ def validate_license(self) -> list[str] | None: self.add_error(f"License should be a list of str. Got: {license}") return None + def validate_sd_license(self) -> str | None: + """Validates the sdLicense as a string.""" + sd_license = self.sd_license + if sd_license is None: + return None + elif isinstance(sd_license, str): + return sd_license + else: + self.add_error(f"sdLicense should be a str. Got: {sd_license}") + return None + def validate_date(self, date: Any) -> datetime.datetime | None: """Validates dates as a datetime for any input.""" if date is None: @@ -381,6 +395,7 @@ def from_jsonld(cls, ctx: Context, metadata: Json) -> Metadata: publisher=publisher, record_sets=record_sets, same_as=box_singleton_list(metadata.get(constants.SCHEMA_ORG_SAME_AS)), + sd_license=metadata.get(constants.SCHEMA_ORG_SD_LICENSE), uuid=uuid_from_jsonld(metadata), url=url, version=metadata.get(constants.SCHEMA_ORG_VERSION), diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata_test.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata_test.py index 360cf53bf..5ff4a8270 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata_test.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata_test.py @@ -28,7 +28,7 @@ def test_checks_are_performed(): create_test_node(Metadata, name="field_name") mandatory_mock.assert_called_once_with("name") optional_mock.assert_called_once_with( - "cite_as", "date_published", "license", "version" + "cite_as", "date_published", "license", "sd_license", "version" ) validate_name_mock.assert_called_once() @@ -48,6 +48,7 @@ def test_from_jsonld(conforms_to: CroissantVersion): # ...or dates can be datetime.dates. constants.SCHEMA_ORG_DATE_PUBLISHED: datetime.date(1990, 2, 3), constants.SCHEMA_ORG_LICENSE: "License", + constants.SCHEMA_ORG_SD_LICENSE: "SD_License", constants.SCHEMA_ORG_URL: "https://mlcommons.org", constants.SCHEMA_ORG_VERSION: "1.0.0", constants.ML_COMMONS_IS_LIVE_DATASET(ctx): False, @@ -68,6 +69,7 @@ def test_from_jsonld(conforms_to: CroissantVersion): assert metadata.license == ["License"] assert metadata.is_live_dataset == False assert metadata.personal_sensitive_information == "personal_sensitive_information" + assert metadata.sd_license == "SD_License" assert metadata.url == "https://mlcommons.org" assert metadata.version == "1.0.0" assert not ctx.issues.errors diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_contained_in/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_contained_in/metadata.json index 8ad618e62..73ccc7f84 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_contained_in/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_contained_in/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_type/metadata.json index 239e358a6..43014951a 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_type/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_name/metadata.json index 337daa30d..2e85d5000 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_name/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_property_content_url/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_property_content_url/metadata.json index 14bc17b4b..05c395e90 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_property_content_url/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_property_content_url/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_bad_type/metadata.json index 95cc7a2b7..fc225b1a2 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_bad_type/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0" } diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_missing_property_name/metadata.json index 39553a409..f6e313627 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_missing_property_name/metadata.json @@ -43,6 +43,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0" } diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_source/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_source/metadata.json index 489fa1400..281b26082 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_source/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_source/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_type/metadata.json index 2aa780d0e..6e3446e8e 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_type/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_property_name/metadata.json index 5e01d753c..e2278e7e5 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_property_name/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_source/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_source/metadata.json index c501f4302..d7147662f 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_source/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_source/metadata.json @@ -43,6 +43,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_bad_type/metadata.json index 097869860..342559626 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_bad_type/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "This is a sdLicense.", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_context_for_datatype/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_context_for_datatype/metadata.json index 8660222c7..6d04dacf2 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_context_for_datatype/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_context_for_datatype/metadata.json @@ -40,6 +40,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_property_name/metadata.json index c33cb66cc..caef30d72 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_property_name/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_wrong_join/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_wrong_join/metadata.json index ee047f2a1..ca3bf6639 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_wrong_join/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_wrong_join/metadata.json @@ -44,6 +44,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_contained_in/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_contained_in/metadata.json index 6827b3ef0..5f4ed0f3b 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_contained_in/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_contained_in/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_type/metadata.json index 14354001f..75a5d0f67 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_type/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_name/metadata.json index cc4a43592..813daa3eb 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_name/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_property_content_url/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_property_content_url/metadata.json index 5486f689a..d78dbda47 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_property_content_url/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_property_content_url/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_bad_type/metadata.json index e2124618d..113373e78 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_bad_type/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0" } diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_missing_property_name/metadata.json index d076e56c4..ceec6344d 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_missing_property_name/metadata.json @@ -50,6 +50,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0" } diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_source/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_source/metadata.json index 8a227db97..6c72c4696 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_source/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_source/metadata.json @@ -47,10 +47,11 @@ "name": "mydataset", "description": "This is a description.", "conformsTo": "http://mlcommons.org/croissant/1.0", - "datePublished": "1990-02-01", "@language": "en", "citeAs": "This is a citation.", + "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_type/metadata.json index b1046c3cd..a9bf7d7ca 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_type/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_property_name/metadata.json index 68eebcb56..46f79474a 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_property_name/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_source/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_source/metadata.json index 40f376ee6..ffd37ec7a 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_source/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_source/metadata.json @@ -50,6 +50,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_bad_type/metadata.json index ea68c8f1b..e9e60fd10 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_bad_type/metadata.json @@ -48,6 +48,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "This is a sdLicense.", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_context_for_datatype/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_context_for_datatype/metadata.json index 8f3be9ec1..8b2dcfebc 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_context_for_datatype/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_context_for_datatype/metadata.json @@ -47,6 +47,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_property_name/metadata.json index a0048f4fe..03ce19612 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_property_name/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_wrong_join/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_wrong_join/metadata.json index 6902e9073..f6c0a4ac0 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_wrong_join/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_wrong_join/metadata.json @@ -51,6 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/scripts/migrations/previous/202423021900.py b/python/mlcroissant/mlcroissant/scripts/migrations/previous/202423021900.py new file mode 100644 index 000000000..f75e7ca09 --- /dev/null +++ b/python/mlcroissant/mlcroissant/scripts/migrations/previous/202423021900.py @@ -0,0 +1,7 @@ +"""Migration: add sdLicense.""" + + +def up(json_ld): + """Up migration to add sdLicense to all included datasets.""" + json_ld["sdLicense"] = "https://www.apache.org/licenses/LICENSE-2.0" + return json_ld