diff --git a/README.md b/README.md index 9ee1291..13ae275 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # GeoCombine - ![CI](https://github.com/OpenGeoMetadata/GeoCombine/actions/workflows/ruby.yml/badge.svg) +![CI](https://github.com/OpenGeoMetadata/GeoCombine/actions/workflows/ruby.yml/badge.svg) | [![Coverage Status](https://img.shields.io/badge/coverage-95%25-brightgreen)]() | [![Gem Version](https://img.shields.io/gem/v/geo_combine.svg)](https://github.com/OpenGeoMetadata/GeoCombine/releases) - A Ruby toolkit for managing geospatial metadata, including: + - tasks for cloning, updating, and indexing OpenGeoMetdata metadata - library for converting metadata between standards @@ -43,6 +43,32 @@ Or install it yourself as: > iso_metadata.to_html ``` +### Migrating metadata + +You can use the `GeoCombine::Migrators` to migrate metadata from one schema to another. + +Currently, the only migrator is `GeoCombine::Migrators::V1AardvarkMigrator` which migrates from the [GeoBlacklight v1 schema](https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/gbl-1.0.md) to the [Aardvark schema](https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark.md). + +```ruby +# Load a record in geoblacklight v1 schema +record = JSON.parse(File.read('spec/fixtures/docs/full_geoblacklight.json')) + +# Migrate it to Aardvark schema +GeoCombine::Migrators::V1AardvarkMigrator.new(v1_hash: record).run +``` + +Some fields cannot be migrated automatically. 
To handle the migration of collection names to IDs when migrating from v1 to Aardvark, you can provide a mapping of collection names to IDs to the migrator: + +```ruby +# You can store this mapping as a JSON or CSV file and load it into a hash +id_map = { + 'My Collection 1' => 'institution:my-collection-1', + 'My Collection 2' => 'institution:my-collection-2' +} + +GeoCombine::Migrators::V1AardvarkMigrator.new(v1_hash: record, collection_id_map: id_map).run +``` + ### OpenGeoMetadata #### Clone OpenGeoMetadata repositories locally @@ -63,7 +89,7 @@ You can also specify a single repository: $ bundle exec rake geocombine:clone[edu.stanford.purl] ``` -*Note: If you are using zsh, you will need to use escape characters in front of the brackets:* +_Note: If you are using zsh, you will need to use escape characters in front of the brackets:_ ```sh $ bundle exec rake geocombine:clone\[edu.stanford.purl\] @@ -83,7 +109,7 @@ You can also specify a single repository: $ bundle exec rake geocombine:pull[edu.stanford.purl] ``` -*Note: If you are using zsh, you will need to use escape characters in front of the brackets:* +_Note: If you are using zsh, you will need to use escape characters in front of the brackets:_ ```sh $ bundle exec rake geocombine:pull\[edu.stanford.purl\] diff --git a/lib/geo_combine/migrators/v1_aardvark_migrator.rb b/lib/geo_combine/migrators/v1_aardvark_migrator.rb index 2f07a0e..62c3e3f 100644 --- a/lib/geo_combine/migrators/v1_aardvark_migrator.rb +++ b/lib/geo_combine/migrators/v1_aardvark_migrator.rb @@ -1,41 +1,86 @@ # frozen_string_literal: true +require 'active_support' + module GeoCombine module Migrators - # TODO: WARNING! This class is not fully implemented and should not be used in - # production. See https://github.com/OpenGeoMetadata/GeoCombine/issues/121 - # for remaining work. 
- # # migrates the v1 schema to the aardvark schema class V1AardvarkMigrator attr_reader :v1_hash # @param v1_hash [Hash] parsed json in the v1 schema - def initialize(v1_hash:) + # @param collection_id_map [Hash] a hash mapping collection names to ids for converting dct_isPartOf_sm + def initialize(v1_hash:, collection_id_map: {}) @v1_hash = v1_hash + @v2_hash = v1_hash.dup + @collection_id_map = collection_id_map end def run - v2_hash = convert_keys - v2_hash['gbl_mdVersion_s'] = 'Aardvark' - v2_hash + # Return unchanged if already in the aardvark schema + return @v2_hash if @v2_hash['gbl_mdVersion_s'] == 'Aardvark' + + # Convert the record + convert_keys + convert_single_to_multi_valued_fields + convert_non_crosswalked_fields + remove_deprecated_fields + + # Mark the record as converted and return it + @v2_hash['gbl_mdVersion_s'] = 'Aardvark' + @v2_hash end + # Namespace and URI changes to fields def convert_keys - v1_hash.transform_keys do |k| + @v2_hash.transform_keys! do |k| SCHEMA_FIELD_MAP[k] || k end end + # Fields that need to be converted from single to multi-valued + def convert_single_to_multi_valued_fields + @v2_hash = @v2_hash.each_with_object({}) do |(k, v), h| + h[k] = if !v.is_a?(Array) && k.match?(/_[si]m\z/) + [v] + else + v + end + end + end + + # Convert non-crosswalked fields via lookup tables + def convert_non_crosswalked_fields + # Keys may or may not include whitespace, so we normalize them. + # Resource class is required so we default to "Other"; resource type is not required. + @v2_hash['gbl_resourceClass_sm'] = RESOURCE_CLASS_MAP[@v1_hash['dc_type_s']&.gsub(/\s+/, '')] || ['Other'] + resource_type = RESOURCE_TYPE_MAP[@v1_hash['layer_geom_type_s']&.gsub(/\s+/, '')] + @v2_hash['gbl_resourceType_sm'] = resource_type unless resource_type.nil? 
+ + # If the user specified a collection id map, use it to convert the collection names to ids + is_part_of = @v1_hash['dct_isPartOf_sm']&.map { |name| @collection_id_map[name] }&.compact + if is_part_of&.any? + @v2_hash['dct_isPartOf_sm'] = is_part_of + else + @v2_hash.delete('dct_isPartOf_sm') + end + end + + # Remove fields that are no longer used + def remove_deprecated_fields + @v2_hash = @v2_hash.except(*SCHEMA_FIELD_MAP.keys, 'dc_type_s', 'layer_geom_type_s') + end + SCHEMA_FIELD_MAP = { 'dc_title_s' => 'dct_title_s', # new namespace 'dc_description_s' => 'dct_description_sm', # new namespace; single to multi-valued 'dc_language_s' => 'dct_language_sm', # new namespace; single to multi-valued - 'dc_language_sm' => 'dct_language_sm', # new namespace; single to multi-valued + 'dc_language_sm' => 'dct_language_sm', # new namespace 'dc_creator_sm' => 'dct_creator_sm', # new namespace 'dc_publisher_s' => 'dct_publisher_sm', # new namespace; single to multi-valued 'dct_provenance_s' => 'schema_provider_s', # new URI name 'dc_subject_sm' => 'dct_subject_sm', # new namespace + 'solr_geom' => 'dcat_bbox', # new URI name 'solr_year_i' => 'gbl_indexYear_im', # new URI name; single to multi-valued 'dc_source_sm' => 'dct_source_sm', # new namespace 'dc_rights_s' => 'dct_accessRights_s', # new URI name @@ -47,6 +92,27 @@ def convert_keys 'geoblacklight_version' => 'gbl_mdVersion_s', # new URI name 'suppressed_b' => 'gbl_suppressed_b' # new namespace }.freeze + + # Map Dublin Core types to Aardvark resource class sets + # See: https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark/resource-class.md + RESOURCE_CLASS_MAP = { + 'Collection' => ['Collections'], + 'Dataset' => ['Datasets'], + 'Image' => ['Imagery'], + 'InteractiveResource' => ['Websites'], + 'Service' => ['Web services'], + 'StillImage' => ['Imagery'] + }.freeze + + # Map geometry types to Aardvark resource type sets + # See: 
https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark/resource-type.md + RESOURCE_TYPE_MAP = { + 'Point' => ['Point data'], + 'Line' => ['Line data'], + 'Polygon' => ['Polygon data'], + 'Raster' => ['Raster data'], + 'Table' => ['Table data'] + }.freeze end end end diff --git a/spec/fixtures/docs/full_geoblacklight.json b/spec/fixtures/docs/full_geoblacklight.json index e58e835..73998b7 100644 --- a/spec/fixtures/docs/full_geoblacklight.json +++ b/spec/fixtures/docs/full_geoblacklight.json @@ -28,6 +28,13 @@ "dct_spatial_sm":[ "Uganda" ], + "dct_isPartOf_sm":[ + "Uganda GIS Maps and Data, 2000-2010" + ], + "dc_source_sm": [ + "stanford-rb371kw9607" + ], "solr_geom":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)", - "solr_year_i":2005 + "solr_year_i":2005, + "suppressed_b":false } diff --git a/spec/fixtures/docs/full_geoblacklight_aardvark.json b/spec/fixtures/docs/full_geoblacklight_aardvark.json index 00debf4..bc3f837 100644 --- a/spec/fixtures/docs/full_geoblacklight_aardvark.json +++ b/spec/fixtures/docs/full_geoblacklight_aardvark.json @@ -1,19 +1,31 @@ { "gbl_mdVersion_s":"Aardvark", - "dct_identifier_sm":"http://purl.stanford.edu/cz128vq0535", + "dct_identifier_sm":[ + "http://purl.stanford.edu/cz128vq0535" + ], "dct_title_s":"2005 Rural Poverty GIS Database: Uganda", - "dct_description_sm":"This polygon shapefile contains 2005 poverty data for 855 rural subcounties in Uganda. These data are intended for researchers, students, policy makers and the general public for reference and mapping purposes, and may be used for basic applications such as viewing, querying, and map output production.", + "dct_description_sm":[ + "This polygon shapefile contains 2005 poverty data for 855 rural subcounties in Uganda. These data are intended for researchers, students, policy makers and the general public for reference and mapping purposes, and may be used for basic applications such as viewing, querying, and map output production." 
+ ], "dct_accessRights_s":"Public", "schema_provider_s":"Stanford", "dct_references_s":"{\"http://schema.org/url\":\"http://purl.stanford.edu/cz128vq0535\",\"http://schema.org/downloadUrl\":\"http://stacks.stanford.edu/file/druid:cz128vq0535/data.zip\",\"http://www.loc.gov/mods/v3\":\"http://purl.stanford.edu/cz128vq0535.mods\",\"http://www.isotc211.org/schemas/2005/gmd/\":\"http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/druid:cz128vq0535/iso19139.xml\",\"http://www.w3.org/1999/xhtml\":\"http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/druid:cz128vq0535/default.html\",\"http://www.opengis.net/def/serviceType/ogc/wfs\":\"https://geowebservices.stanford.edu/geoserver/wfs\",\"http://www.opengis.net/def/serviceType/ogc/wms\":\"https://geowebservices.stanford.edu/geoserver/wms\"}", "gbl_wxsIdentifier_s":"druid:cz128vq0535", "id":"stanford-cz128vq0535", - "layer_geom_type_s":"Polygon", + "gbl_resourceType_sm": [ + "Polygon data" + ], "gbl_mdModified_dt":"2015-01-13T18:46:38Z", "dct_format_s":"Shapefile", - "dct_language_sm":"English", - "dc_type_s":"Dataset", - "dct_publisher_sm":"Uganda Bureau of Statistics", + "dct_language_sm":[ + "English" + ], + "gbl_resourceClass_sm":[ + "Datasets" + ], + "dct_publisher_sm":[ + "Uganda Bureau of Statistics" + ], "dct_creator_sm":[ "Uganda Bureau of Statistics" ], @@ -28,6 +40,12 @@ "dct_spatial_sm":[ "Uganda" ], - "solr_geom":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)", - "gbl_indexYear_im":2005 + "dct_source_sm": [ + "stanford-rb371kw9607" + ], + "dcat_bbox":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)", + "gbl_indexYear_im":[ + 2005 + ], + "gbl_suppressed_b":false } diff --git a/spec/lib/geo_combine/migrators/v1_aardvark_migrator_spec.rb b/spec/lib/geo_combine/migrators/v1_aardvark_migrator_spec.rb index ed81929..e6a8eee 100644 --- a/spec/lib/geo_combine/migrators/v1_aardvark_migrator_spec.rb +++ b/spec/lib/geo_combine/migrators/v1_aardvark_migrator_spec.rb @@ -6,17 +6,41 @@ include 
JsonDocs describe '#run' do - it 'migrates keys' do + it 'migrates fields to new names and types' do input_hash = JSON.parse(full_geoblacklight) - # TODO: Note that this fixture has not yet been fully converted to - # aardvark. See https://github.com/OpenGeoMetadata/GeoCombine/issues/121 - # for remaining work. expected_output = JSON.parse(full_geoblacklight_aardvark) expect(described_class.new(v1_hash: input_hash).run).to eq(expected_output) end + it 'removes deprecated fields' do + input_hash = JSON.parse(full_geoblacklight) + output = described_class.new(v1_hash: input_hash).run + expect(output.keys).not_to include(*described_class::SCHEMA_FIELD_MAP.keys) + expect(output.keys).not_to include('dc_type_s') + expect(output.keys).not_to include('layer_geom_type_s') + end + + it 'leaves custom fields unchanged' do + input_hash = JSON.parse(full_geoblacklight) + input_hash['custom_field'] = 'custom_value' + output = described_class.new(v1_hash: input_hash).run + expect(output['custom_field']).to eq('custom_value') + end + context 'when the given record is already in aardvark schema' do - xit 'returns the record unchanged' + it 'returns the record unchanged' do + input_hash = JSON.parse(full_geoblacklight_aardvark) + expect(described_class.new(v1_hash: input_hash).run).to eq(input_hash) + end + end + + context 'when the user supplies a mapping for collection names to ids' do + it 'converts the collection names to ids' do + input_hash = JSON.parse(full_geoblacklight) + collection_id_map = { 'Uganda GIS Maps and Data, 2000-2010' => 'stanford-rb371kw9607' } + output = described_class.new(v1_hash: input_hash, collection_id_map: collection_id_map).run + expect(output['dct_isPartOf_sm']).to eq(['stanford-rb371kw9607']) + end end end end