From 9a3796c8f5234f6a1f8307d517511083090e872f Mon Sep 17 00:00:00 2001 From: niklub Date: Wed, 29 Jan 2020 20:11:07 +0300 Subject: [PATCH] Input data fixes (#200) var fixes with input data stream etc --- label_studio/project.py | 513 +++++++----------- label_studio/server.py | 170 +----- label_studio/tasks.py | 103 ++++ label_studio/templates/import_help.html | 2 +- label_studio/utils/analytics.py | 11 +- label_studio/utils/argparser.py | 108 ++++ label_studio/utils/io.py | 21 + label_studio/utils/misc.py | 21 - label_studio/utils/schema/default_config.json | 23 + setup.py | 2 +- 10 files changed, 485 insertions(+), 489 deletions(-) create mode 100644 label_studio/tasks.py create mode 100644 label_studio/utils/argparser.py create mode 100644 label_studio/utils/schema/default_config.json diff --git a/label_studio/project.py b/label_studio/project.py index a008d76b1ed..d32fa7d5cad 100644 --- a/label_studio/project.py +++ b/label_studio/project.py @@ -2,23 +2,23 @@ import io import logging import json -import urllib -import orjson import random from shutil import copy2 -from collections import OrderedDict, defaultdict +from collections import defaultdict from datetime import datetime from operator import itemgetter +from xml.etree import ElementTree +from uuid import uuid4 from label_studio_converter import Converter -from label_studio.utils.misc import LabelConfigParser, config_line_stripped, config_comments_free, parse_config +from label_studio.utils.misc import config_line_stripped, config_comments_free, parse_config from label_studio.utils.analytics import Analytics from label_studio.utils.models import ProjectObj, MLBackend from label_studio.utils.exceptions import ValidationError -from label_studio.utils.io import find_file, delete_dir_content - +from label_studio.utils.io import find_file, delete_dir_content, json_load +from label_studio.tasks import Tasks logger = logging.getLogger(__name__) @@ -37,21 +37,76 @@ def __init__(self, config, name, context=None): self.config = config self.name = name + self.on_boarding = {} + self.context = context or {} + self.tasks = None + self.load_tasks() + + self.label_config_line, self.label_config_full, self.input_data_tags = None, None, None + self.load_label_config() + + self.derived_input_schema, self.derived_output_schema = None, None + self.load_derived_schemas() + + self.analytics = None + self.load_analytics() + + self.project_obj, self.ml_backend = None, None + self.load_project_ml_backend() + + self.converter = None + self.load_converter() + + def load_tasks(self): + self.tasks = json_load(self.config['input_path']) + self.tasks = {int(k): v for k, v in self.tasks.items()} + print(str(len(self.tasks)) + ' tasks loaded from: ' + self.config['input_path']) + + def load_label_config(self): + self.label_config_full = config_comments_free(open(self.config['label_config']).read()) + self.label_config_line = config_line_stripped(self.label_config_full) + self.input_data_tags = self.get_input_data_tags(self.label_config_line) + + def load_derived_schemas(self): + num_tasks_loaded = len(self.tasks) self.derived_input_schema = [] self.derived_output_schema = { 'from_name_to_name_type': set(), 'labels': defaultdict(set) } - self.label_config_line = None - self.label_config_full = None - self.ml_backend = None - self.project_obj = None - self.analytics = None - self.converter = None - self.on_boarding = {} - self.context = context or {} - self.reload() + if num_tasks_loaded > 0: + for tag in self.input_data_tags: + self.derived_input_schema.append({ + 'type': tag.tag, + 'value': tag.attrib['value'].lstrip('$') + }) + + # for all already completed tasks we update derived output schema for further label config validation + for task_id in self.get_task_ids(): + task_with_completions = self.get_task_with_completions(task_id) + if task_with_completions and 'completions' in task_with_completions: + completions = task_with_completions['completions'] + for completion in completions: + self._update_derived_output_schema(completion) + + def load_analytics(self): + collect_analytics = os.getenv('collect_analytics') + if collect_analytics is None: + collect_analytics = self.config.get('collect_analytics', True) + self.analytics = Analytics(self.label_config_line, collect_analytics, self.name, self.context) + + def load_project_ml_backend(self): + # configure project + self.project_obj = ProjectObj(label_config=self.label_config_line, label_config_full=self.label_config_full) + # configure machine learning backend + ml_backend_params = self.config.get('ml_backend') + if ml_backend_params: + self.ml_backend = MLBackend.from_params(ml_backend_params) + self.project_obj.connect(self.ml_backend) + + def load_converter(self): + self.converter = Converter(self.label_config_full) @property def id(self): @@ -72,6 +127,7 @@ def validate_label_config(self, config_string): self.project_obj.validate_label_config(config_string) parsed_config = parse_config(config_string) + self.validate_label_config_on_derived_input_schema(parsed_config) self.validate_label_config_on_derived_output_schema(parsed_config) @@ -81,13 +137,21 @@ def update_label_config(self, new_label_config): with io.open(label_config_file, mode='w') as f: f.write(new_label_config) + # reload everything that depends on label config + self.load_label_config() + self.load_derived_schemas() + self.load_analytics() + self.load_project_ml_backend() + self.load_converter() + # save project config state self.config['label_config_updated'] = True with io.open(self.config['config_path'], mode='w') as f: json.dump(self.config, f) logger.info('Label config saved to: {path}'.format(path=label_config_file)) - def _get_single_input_value(self, input_data_tags): + @classmethod + def _get_single_input_value(cls, input_data_tags): if len(input_data_tags) > 1: val = ",".join(tag.attrib.get("name") for tag in input_data_tags) print('Warning! Multiple input data tags found: ' + @@ -96,38 +160,6 @@ def _get_single_input_value(self, input_data_tags): data_key = input_data_tag.attrib.get('value').lstrip('$') return data_key - def _create_task_with_local_uri(self, filepath, data_key, task_id): - """ Convert filepath to task with flask serving URL - """ - filename = os.path.basename(self, filepath) - params = urllib.parse.urlencode({'d': os.path.dirname(filepath)}) - base_url = 'http://localhost:{port}/'.format(port=self.config.get("port")) - image_url_path = base_url + urllib.parse.quote('data/' + filename) - image_local_url = '{image_url_path}?{params}'.format(image_url_path=image_url_path, params=params) - return { - 'id': task_id, - 'task_path': filepath, - 'data': {data_key: image_local_url} - } - - def is_text_annotation(self, input_data_tags, filepath): - return ( - len(input_data_tags) == 1 and input_data_tags[0].tag == 'Text' - and filepath.endswith(self._allowed_extensions['Text']) - ) - - def is_image_annotation(self, input_data_tags, filepath): - return ( - len(input_data_tags) == 1 and input_data_tags[0].tag == 'Image' - and filepath.lower().endswith(self._allowed_extensions['Image']) - ) - - def is_audio_annotation(self, input_data_tags, filepath): - return ( - len(input_data_tags) == 1 and input_data_tags[0].tag in ('Audio', 'AudioPlus') - and filepath.lower().endswith(self._allowed_extensions['Audio']) - ) - def _update_derived_output_schema(self, completion): """ Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type) @@ -139,7 +171,7 @@ def _update_derived_output_schema(self, completion): self.derived_output_schema['from_name_to_name_type'].add(( result['from_name'], result['to_name'], result['type'] )) - for label in result['value'][result['type']]: + for label in result['value'].get(result['type'], []): self.derived_output_schema['labels'][result['from_name']].add(label) def validate_label_config_on_derived_input_schema(self, config_string_or_parsed_config): @@ -225,111 +257,6 @@ def validate_label_config_on_derived_output_schema(self, config_string_or_parsed .format(from_name=from_name, extra_labels=extra_labels) ) - def tasks_from_json_file(self, path): - """ Prepare tasks from json - - :param path: path to json with list or dict - :param tasks: main db instance of tasks - :return: new task id - """ - def push_task(root): - task_id = len(self.tasks) + 1 - data = root['data'] if 'data' in root else root - self.tasks[task_id] = {'id': task_id, 'task_path': path, 'data': data} - if 'predictions' in data: - self.tasks[task_id]['predictions'] = data['predictions'] - self.tasks[task_id]['data'].pop('predictions', None) - if 'predictions' in root: - self.tasks[task_id]['predictions'] = root['predictions'] - - logger.debug('Reading tasks from JSON file ' + path) - with open(path) as f: - json_body = orjson.loads(f.read()) - - # multiple tasks in file - if isinstance(json_body, list): - [push_task(data) for data in json_body] - - # one task in file - elif isinstance(json_body, dict): - push_task(json_body) - - # unsupported task type - else: - raise Exception('Unsupported task data:', path) - - def _init(self): - label_config = LabelConfigParser(self.config['label_config']) - - if not os.path.exists(self.config['output_dir']): - os.mkdir(self.config['output_dir']) - - task_id = 0 - data_key = None - - input_data_tags = label_config.get_input_data_tags() - - # load at first start - self.tasks = OrderedDict() - - # file - if os.path.isfile(self.config['input_path']): - files = [os.path.basename(self.config['input_path'])] - root_dir = os.path.normpath(os.path.dirname(self.config['input_path'])) - - # directory - else: - root_dir = os.path.normpath(self.config['input_path']) - files = [os.path.join(root, f) for root, _, files in os.walk(root_dir) for f in files \ - if 'completion' not in f and 'completion' not in root] - - # walk over all the files - for f in files: - norm_f = os.path.normpath(f) - path = os.path.join(root_dir, norm_f) if not norm_f.startswith(root_dir) else f - - # load tasks from json - if f.endswith('.json'): - self.tasks_from_json_file(path) - - # load tasks from txt: line by line, task by task - elif self.is_text_annotation(input_data_tags, f): - if data_key is None: - data_key = self._get_single_input_value(input_data_tags) - with io.open(path) as fin: - for line in fin: - task_id = len(self.tasks) + 1 - self.tasks[task_id] = {'id': task_id, 'task_path': path, 'data': {data_key: line.strip()}} - - # load tasks from files: creating URI to local resources - elif self.is_image_annotation(input_data_tags, f) or self.is_audio_annotation(input_data_tags, f): - if data_key is None: - data_key = self._get_single_input_value(input_data_tags) - task_id = len(self.tasks) + 1 - self.tasks[task_id] = self._create_task_with_local_uri(f, data_key, task_id) - else: - logger.warning('Unrecognized file format for file ' + f) - - num_tasks_loaded = len(self.tasks) - - # make derived input schema - if num_tasks_loaded > 0: - for tag in input_data_tags: - self.derived_input_schema.append({ - 'type': tag.tag, - 'value': tag.attrib['value'].lstrip('$') - }) - - # for all already completed tasks we update derived output schema for further label config validation - for task_id in self.get_task_ids(): - task_with_completions = self.get_task_with_completions(task_id) - if task_with_completions and 'completions' in task_with_completions: - completions = task_with_completions['completions'] - for completion in completions: - self._update_derived_output_schema(completion) - - print(str(len(self.tasks)) + ' tasks loaded from: ' + self.config['input_path']) - def get_tasks(self): """ Load tasks from JSON files in input_path directory @@ -343,9 +270,13 @@ def delete_tasks(self): :return: """ delete_dir_content(self.config['output_dir']) - with io.open(self.config['input_path'], mode='w') as f: - json.dump([], f) - self.reload() + if os.path.exists(self.config['input_path']) and os.path.isfile(self.config['input_path']): + with io.open(self.config['input_path'], mode='w') as f: + json.dump({}, f) + + # reload everything related to tasks + self.load_tasks() + self.load_derived_schemas() def iter_tasks(self): sampling = self.config.get('sampling', 'sequential') @@ -468,43 +399,40 @@ def delete_completion(self, task_id): filename = os.path.join(self.config['output_dir'], str(task_id) + '.json') os.remove(filename) - def reload(self): - self.tasks = None - self.derived_input_schema = [] - self.derived_output_schema = { - 'from_name_to_name_type': set(), - 'labels': defaultdict(set) - } - - self._init() - - self.label_config_full = config_comments_free(open(self.config['label_config']).read()) - self.label_config_line = config_line_stripped(self.label_config_full) - - collect_analytics = os.getenv('collect_analytics') - if collect_analytics is None: - collect_analytics = self.config.get('collect_analytics', True) - if self.analytics is None: - self.analytics = Analytics(self.label_config_line, collect_analytics, self.name, self.context) - else: - self.analytics.update_info(self.label_config_line, collect_analytics, self.name, self.context) - - # configure project - self.project_obj = ProjectObj(label_config=self.label_config_line, label_config_full=self.label_config_full) - - # configure machine learning backend - if self.ml_backend is None: - ml_backend_params = self.config.get('ml_backend') - if ml_backend_params: - ml_backend = MLBackend.from_params(ml_backend_params) - self.project_obj.connect(ml_backend) - - self.converter = Converter(self.label_config_full) - @classmethod def get_project_dir(cls, project_name, args): return os.path.join(args.root_dir, project_name) + @classmethod + def get_input_data_tags(cls, label_config): + tag_iter = ElementTree.fromstring(label_config).iter() + return [ + tag for tag in tag_iter + if tag.attrib.get('name') and tag.attrib.get('value', '').startswith('$') + ] + + @classmethod + def _load_tasks(cls, input_path, args, label_config_file): + with io.open(label_config_file) as f: + label_config = f.read() + + task_loader = Tasks() + if args.input_format == 'json': + return task_loader.from_json_file(input_path) + if args.input_format == 'json-dir': + return task_loader.from_dir_with_json_files(input_path) + input_data_tags = cls.get_input_data_tags(label_config) + data_key = Project._get_single_input_value(input_data_tags) + if args.input_format == 'text': + return task_loader.from_text_file(input_path, data_key) + if args.input_format == 'text-dir': + return task_loader.from_dir_with_text_files(input_path, data_key) + if args.input_format == 'image-dir': + return task_loader.from_dir_with_image_files(input_path, data_key) + if args.input_format == 'audio-dir': + return task_loader.from_dir_with_audio_files(input_path, data_key) + raise RuntimeError('Can\'t load tasks for input format={}'.format(args.input_format)) + @classmethod def create_project_dir(cls, project_name, args): """ @@ -517,81 +445,74 @@ def create_project_dir(cls, project_name, args): """ dir = cls.get_project_dir(project_name, args) os.makedirs(dir, exist_ok=True) - label_config_name = 'config.xml' - output_dir_name = 'completions' - input_path_name = 'tasks.json' - default_config_file = os.path.join(dir, 'config.json') - default_label_config_file = os.path.join(dir, label_config_name) - default_output_dir = os.path.join(dir, output_dir_name) - default_input_path = os.path.join(dir, input_path_name) - - if hasattr(args, 'config_path') and args.config_path: - copy2(args.config_path, default_config_file) - if hasattr(args, 'input_path') and args.input_path: - copy2(args.input_path, default_input_path) - if hasattr(args, 'output_dir') and args.output_dir: - if os.path.exists(args.output_dir): - copy2(args.output_dir, default_output_dir) - if hasattr(args, 'label_config') and args.label_config: - copy2(args.label_config, default_label_config_file) - - default_config = { - 'title': 'Label Studio', - 'port': 8200, - 'debug': False, - - 'label_config': label_config_name, - 'input_path': input_path_name, - 'output_dir': output_dir_name, - - 'instruction': 'Type some hypertext for label experts!', - 'allow_delete_completions': True, - 'templates_dir': 'examples', - - 'editor': { - 'debug': False - }, - - '!ml_backend': { - 'url': 'http://localhost:9090', - 'model_name': 'my_super_model' - }, - 'sampling': 'uniform' - } - - # create input_path (tasks.json) - if not os.path.exists(default_input_path): - with io.open(default_input_path, mode='w') as fout: - json.dump([], fout, indent=2) - print(default_input_path + ' input path has been created.') - else: - print(default_input_path + ' input path already exists.') - # create config file (config.json) - if not os.path.exists(default_config_file): - with io.open(default_config_file, mode='w') as fout: - json.dump(default_config, fout, indent=2) - print(default_config_file + ' config file has been created.') - else: - print(default_config_file + ' config file already exists.') + config = json_load(args.config_path) if args.config_path else json_load(find_file('default_config.json')) - # create label config (config.xml) - if not os.path.exists(default_label_config_file): - path = find_file('examples/image_polygons/config.xml') - default_label_config = open(path).read() + def already_exists_error(what, path): + raise RuntimeError('{path} {what} already exists. Use "--force" option to recreate it.'.format( + path=path, what=what + )) - with io.open(default_label_config_file, mode='w') as fout: - fout.write(default_label_config) - print(default_label_config_file + ' label config file has been created.') + # save label config + config_xml = 'config.xml' + config_xml_path = os.path.join(dir, config_xml) + label_config_file = args.label_config or config.get('label_config') + if label_config_file: + copy2(label_config_file, config_xml_path) + print(label_config_file + ' label config copied to ' + config_xml_path) else: - print(default_label_config_file + ' label config file already exists.') - - # create output dir (completions) - if not os.path.exists(default_output_dir): - os.makedirs(default_output_dir) - print(default_output_dir + ' output directory has been created.') + if os.path.exists(config_xml_path) and not args.force: + already_exists_error('label config', config_xml_path) + default_label_config = find_file('examples/image_polygons/config.xml') + copy2(default_label_config, config_xml_path) + print(default_label_config + ' label config copied to ' + config_xml_path) + config['label_config'] = config_xml + + # save tasks.json + tasks_json = 'tasks.json' + tasks_json_path = os.path.join(dir, tasks_json) + input_path = args.input_path or config.get('input_path') + if input_path: + tasks = cls._load_tasks(input_path, args, config_xml_path) + with io.open(tasks_json_path, mode='w') as fout: + json.dump(tasks, fout, indent=2) + print(tasks_json_path + ' input path has been created from ' + input_path) + else: + if os.path.exists(tasks_json_path) and not args.force: + already_exists_error('input path', tasks_json_path) + with io.open(tasks_json_path, mode='w') as fout: + json.dump({}, fout) + print(tasks_json_path + ' input path has been created with empty tasks.') + config['input_path'] = tasks_json + + # create completions dir + completions_dir = os.path.join(dir, 'completions') + if os.path.exists(completions_dir) and not args.force: + already_exists_error('output dir', completions_dir) + if os.path.exists(completions_dir): + delete_dir_content(completions_dir) + print(completions_dir + ' output dir already exists. Clear it.') else: - print(default_output_dir + ' output directory already exists.') + os.makedirs(completions_dir, exist_ok=True) + print(completions_dir + ' output dir has been created.') + config['output_dir'] = 'completions' + + if args.ml_backend_url: + if 'ml_backend' not in config or not isinstance(config['ml_backend'], dict): + config['ml_backend'] = {} + config['ml_backend']['url'] = args.ml_backend_url + if args.ml_backend_name: + config['ml_backend']['name'] = args.ml_backend_name + else: + config['ml_backend']['name'] = str(uuid4()) + + # create config.json + config_json = 'config.json' + config_json_path = os.path.join(dir, config_json) + if os.path.exists(config_json_path) and not args.force: + already_exists_error('config', config_json_path) + with io.open(config_json_path, mode='w') as f: + json.dump(config, f, indent=2) print('') print('Label Studio has been successfully initialized. Check project states in ' + dir) @@ -601,67 +522,34 @@ def create_project_dir(cls, project_name, args): @classmethod def _get_config(cls, project_dir, args): """ - Get config path from input args Namespace acquired by Argparser - :param args: + Get config from input args Namespace acquired by Argparser :param args: :return: """ - # if config is explicitly specified, just return it - if args.config_path: - config_path = args.config_path - else: - # check if project directory exists - if not os.path.exists(project_dir): - raise FileNotFoundError( - 'Couldn\'t find directory ' + project_dir + - ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + - args.project_name + ' --init' - ) - - # check config.json exists in directory - config_path = os.path.join(project_dir, 'config.json') - if not os.path.exists(config_path): - raise FileNotFoundError( - 'Couldn\'t find config file ' + config_path + ' in project directory ' + project_dir + - ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + args.project_name + ' --init' - ) + # check if project directory exists + if not os.path.exists(project_dir): + raise FileNotFoundError( + 'Couldn\'t find directory ' + project_dir + + ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + + args.project_name + ' --init' + ) + + # check config.json exists in directory + config_path = os.path.join(project_dir, 'config.json') + if not os.path.exists(config_path): + raise FileNotFoundError( + 'Couldn\'t find config file ' + config_path + ' in project directory ' + project_dir + + ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + args.project_name + ' --init' + ) config_path = os.path.abspath(config_path) with io.open(config_path) as c: config = json.load(c) - if args.port: - config['port'] = args.port - - if args.label_config: - config['label_config'] = args.label_config - - if args.input_path: - config['input_path'] = args.input_path - - if args.output_dir: - config['output_dir'] = args.output_dir - - if args.debug is not None: - config['debug'] = args.debug - - if args.ml_backend_url: - if 'ml_backend' not in config: - config['ml_backend'] = {} - config['ml_backend']['url'] = args.ml_backend_url - - if args.ml_backend_name: - if 'ml_backend' not in config: - config['ml_backend'] = {} - config['ml_backend']['name'] = args.ml_backend_name - - # absolutize paths relative to config.json - config_dir = os.path.dirname(config_path) - config['label_config'] = os.path.join(config_dir, config['label_config']) - config['input_path'] = os.path.join(config_dir, config['input_path']) - config['output_dir'] = os.path.join(config_dir, config['output_dir']) config['config_path'] = config_path - + config['input_path'] = os.path.join(os.path.dirname(config_path), config['input_path']) + config['label_config'] = os.path.join(os.path.dirname(config_path), config['label_config']) + config['output_dir'] = os.path.join(os.path.dirname(config_path), config['output_dir']) return config @classmethod @@ -681,6 +569,7 @@ def get(cls, project_name, args, context): if os.path.exists(project_dir): project = cls._load_from_dir(project_dir, project_name, args, context) cls._storage[project_name] = project + return project raise KeyError('Project {p} doesn\'t exist'.format(p=project_name)) diff --git a/label_studio/server.py b/label_studio/server.py index 3ab05b63ce6..37a8dd9cac4 100644 --- a/label_studio/server.py +++ b/label_studio/server.py @@ -26,11 +26,11 @@ from label_studio.utils.validation import TaskValidator from label_studio.utils.exceptions import ValidationError from label_studio.utils.functions import generate_sample_task_without_check, data_examples -from label_studio.utils.misc import ( - exception_treatment, log_config, log, config_line_stripped, - get_config_templates, iter_config_templates -) +from label_studio.utils.misc import exception_treatment, log_config, log, config_line_stripped, get_config_templates +from label_studio.utils.argparser import parse_input_args + from label_studio.project import Project +from label_studio.tasks import Tasks logger = logging.getLogger(__name__) @@ -208,6 +208,7 @@ def import_page(): project = project_get_or_create() project.analytics.send(getframeinfo(currentframe()).function) + project.project_obj.name = project.name return flask.render_template( 'import.html', config=project.config, @@ -302,7 +303,6 @@ def api_save_config(): return make_response(jsonify({'label_config': [str(e)]}), status.HTTP_400_BAD_REQUEST) project.update_label_config(label_config) - project.reload() project.analytics.send(getframeinfo(currentframe()).function) return Response(status=status.HTTP_201_CREATED) @@ -401,30 +401,19 @@ class DjangoRequest: except ValidationError as e: return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST) - # save task file to input dir - if os.path.isdir(project.config['input_path']): - # tasks are in directory, write a new file with tasks - task_dir = project.config['input_path'] - now = datetime.now() - data = json.dumps(new_tasks, ensure_ascii=False) - md5 = hashlib.md5(json.dumps(data).encode('utf-8')).hexdigest() - name = 'import-' + now.strftime('%Y-%m-%d-%H-%M') + '-' + str(md5[0:8]) - path = os.path.join(task_dir, name + '.json') - tasks = new_tasks - else: - # tasks are all in one file, append it - path = project.config['input_path'] - old_tasks = json.load(open(path)) - assert isinstance(old_tasks, list), 'Tasks from input_path must be list' - tasks = old_tasks + new_tasks - logger.error("It's recommended to use directory as input_path: " + - project.config['input_path'] + ' -> ' + os.path.dirname(project.config['input_path'])) + # tasks are all in one file, append it + path = project.config['input_path'] + old_tasks = json.load(open(path)) + max_id_in_old_tasks = max(old_tasks.keys()) if old_tasks else -1 + new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1) + old_tasks.update(new_tasks) with open(path, 'w') as f: - json.dump(tasks, f, ensure_ascii=False, indent=4) + json.dump(old_tasks, f, ensure_ascii=False, indent=4) - # load new tasks - project.reload() + # load new tasks and everything related + project.load_tasks() + project.load_derived_schemas() duration = time.time() - start return make_response(jsonify({ @@ -627,138 +616,17 @@ def get_data_file(filename): return flask.send_from_directory(directory, filename, as_attachment=True) -def parse_input_args(): - """ Combine args with json config - - :return: config dict - """ - import sys - import argparse - - if len(sys.argv) == 1: - print('\nQuick start usage: label-studio start my_project --init\n') - - root_parser = argparse.ArgumentParser(add_help=False) - root_parser.add_argument( - '-b', '--no-browser', dest='no_browser', action='store_true', - help='Do not open browser at label studio start' - ) - root_parser.add_argument( - '-d', '--debug', dest='debug', action='store_true', - help='Debug mode for Flask', default=None - ) - root_parser.add_argument( - '--root-dir', dest='root_dir', default='.', - help='Projects root directory' - ) - root_parser.add_argument( - '-v', '--verbose', dest='verbose', action='store_true', - help='Increase output verbosity') - - parser = argparse.ArgumentParser(description='Label studio') - - subparsers = parser.add_subparsers(dest='command', help='Available commands') - subparsers.required = True - - # init sub-command parser - - available_templates = [os.path.basename(os.path.dirname(f)) for f in iter_config_templates()] - - parser_init = subparsers.add_parser('init', help='Initialize Label Studio', parents=[root_parser]) - parser_init.add_argument( - 'project_name', - help='Path to directory where project state will be initialized') - parser_init.add_argument( - '--template', dest='template', choices=available_templates, - help='Choose from predefined project templates' - ) - - # start sub-command parser - - parser_start = subparsers.add_parser('start', help='Start Label Studio server', parents=[root_parser]) - parser_start.add_argument( - 'project_name', - help='Path to directory where project state has been initialized' - ) - parser_start.add_argument( - '--init', dest='init', action='store_true', - help='Initialize if project is not initialized yet' - ) - parser_start.add_argument( - '--template', dest='template', choices=available_templates, - help='Choose from predefined project templates' - ) - parser_start.add_argument( - '-c', '--config', dest='config_path', - help='Server config') - parser_start.add_argument( - '-l', '--label-config', dest='label_config', default='', - help='Label config path') - parser_start.add_argument( - '-i', '--input-path', dest='input_path', default='', - help='Input path to task file or directory with tasks') - parser_start.add_argument( - '-o', '--output-dir', dest='output_dir', default='', - help='Output directory for completions') - parser_start.add_argument( - '-p', '--port', dest='port', default=8200, type=int, - help='Server port') - parser_start.add_argument( - '--ml-backend-url', dest='ml_backend_url', - help='Machine learning backend URL') - parser_start.add_argument( - '--ml-backend-name', dest='ml_backend_name', - help='Machine learning backend name') - - # start-multi-session sub-command parser - - parser_start_ms = subparsers.add_parser( - 'start-multi-session', help='Start Label Studio server', parents=[root_parser]) - parser_start_ms.add_argument( - '--template', dest='template', choices=available_templates, - help='Choose from predefined project templates' - ) - parser_start_ms.add_argument( - '-c', '--config', dest='config_path', - help='Server config') - parser_start_ms.add_argument( - '-l', '--label-config', dest='label_config', default='', - help='Label config path') - parser_start_ms.add_argument( - '-i', '--input-path', dest='input_path', default='', - help='Input path to task file or directory with tasks') - parser_start_ms.add_argument( - '-o', '--output-dir', dest='output_dir', default='', - help='Output directory for completions') - parser_start_ms.add_argument( - '-p', '--port', dest='port', default=8200, type=int, - help='Server port') - parser_start_ms.add_argument( - '--ml-backend-url', dest='ml_backend_url', - help='Machine learning backend URL') - parser_start_ms.add_argument( - '--ml-backend-name', dest='ml_backend_name', - help='Machine learning backend name') - - args = parser.parse_args() - label_config_explicitly_specified = hasattr(args, 'label_config') and args.label_config - if args.template and not label_config_explicitly_specified: - args.label_config = os.path.join(find_dir('examples'), args.template, 'config.xml') - if not hasattr(args, 'label_config'): - args.label_config = None - return args - - def main(): import threading import webbrowser - import label_studio.utils.functions - global input_args input_args = parse_input_args() + import label_studio.utils.functions + label_studio.utils.functions.HOSTNAME = 'http://localhost:' + str(input_args.port) + # On `init` command, create directory args.project_name with initial project state and exit if input_args.command == 'init': Project.create_project_dir(input_args.project_name, input_args) @@ -770,8 +638,6 @@ def main(): if input_args.init: Project.create_project_dir(input_args.project_name, input_args) - label_studio.utils.functions.HOSTNAME = 'http://localhost:' + str(input_args.port) - # On `start` command, launch browser if --no-browser is not specified and start label studio server if input_args.command == 'start': if not input_args.no_browser: diff --git a/label_studio/tasks.py b/label_studio/tasks.py new file mode 100644 index 00000000000..49ca5980707 --- /dev/null +++ b/label_studio/tasks.py @@ -0,0 +1,103 @@ +import orjson +import os +import io +import urllib + +from label_studio.utils.io import iter_files + + +class Tasks(object): + + _allowed_extensions = { + 'Text': ('.txt',), + 'Image': ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'), + 'Audio': ('.wav', '.aiff', '.mp3', '.au', '.flac') + } + + def _create_task_with_local_uri(self, filepath, data_key, task_id): + """ Convert filepath to task with flask serving URL + """ + from label_studio.utils.functions import HOSTNAME + + filename = os.path.basename(filepath) + params = urllib.parse.urlencode({'d': os.path.dirname(filepath)}) + base_url = HOSTNAME + '/' + image_url_path = base_url + urllib.parse.quote('data/' + filename) + image_local_url = '{image_url_path}?{params}'.format(image_url_path=image_url_path, params=params) + return { + 'id': task_id, + 'task_path': filepath, + 'data': {data_key: image_local_url} + } + + def from_dict(self, d, task_id=0): + task = {} + data = d['data'] if 'data' in d else d + task[task_id] = {'id': task_id, 'data': data} + if 'predictions' in data: + task[task_id]['predictions'] = data['predictions'] + task[task_id]['data'].pop('predictions', None) + if 'predictions' in d: + task[task_id]['predictions'] = d['predictions'] + return task + + def from_list_of_dicts(self, l, start_task_id=0): + tasks = {} + for i, t in enumerate(l): + tasks.update(self.from_dict(t, start_task_id + i)) + return tasks + + def from_json_file(self, path, start_task_id=0): + with open(path) as f: + json_body = orjson.loads(f.read()) + + # multiple tasks in file + if isinstance(json_body, list): + tasks = {} + task_id = start_task_id + for d in json_body: + tasks.update(self.from_dict(d, task_id)) + task_id += 1 + return tasks + + # one task in file + elif isinstance(json_body, dict): + tasks = self.from_dict(json_body, start_task_id) + return tasks + + # unsupported task type + else: + raise Exception('Unsupported task data:', path) + + def from_dir_with_json_files(self, path): + tasks = {} + for f in iter_files(path, ext='.json'): + tasks.update(self.from_json_file(f, start_task_id=len(tasks))) + return tasks + + def from_text_file(self, path, data_key, start_task_id=0): + tasks = {} + task_id = start_task_id + with io.open(path) as f: + for line in f: + tasks[task_id] = {'id': task_id, 'data': {data_key: line.strip()}} + return tasks + + def from_dir_with_text_files(self, path, data_key): + tasks = {} + for f in iter_files(path, ext=''): + tasks.update(self.from_text_file(f, data_key, start_task_id=len(tasks))) + return tasks + + def _from_dir_with_local_resources(self, path, data_key, data_type): + tasks = {} + for f in iter_files(path, ext=self._allowed_extensions[data_type]): + task_id = len(tasks) + 1 + tasks[task_id] = self._create_task_with_local_uri(f, data_key, task_id) + return tasks + + def from_dir_with_image_files(self, path, data_key): + return self._from_dir_with_local_resources(path, data_key, 'Image') + + def from_dir_with_audio_files(self, path, data_key): + return self._from_dir_with_local_resources(path, data_key, 'Audio') diff --git a/label_studio/templates/import_help.html b/label_studio/templates/import_help.html index a14558ce1ca..eb514cdc2e7 100644 --- a/label_studio/templates/import_help.html +++ b/label_studio/templates/import_help.html @@ -3,7 +3,7 @@
Import formats and examples
You can use open source datasets and build tasks in - + Heartex.Datasets .

diff --git a/label_studio/utils/analytics.py b/label_studio/utils/analytics.py index b22207bc573..9d397af8134 100644 --- a/label_studio/utils/analytics.py +++ b/label_studio/utils/analytics.py @@ -2,10 +2,13 @@ import os import io import requests +import calendar +from datetime import datetime from mixpanel import Mixpanel, MixpanelException from copy import deepcopy from operator import itemgetter + from uuid import uuid4 from .misc import get_app_version, parse_config, convert_string_to_hash from .io import get_config_dir @@ -63,6 +66,9 @@ def _get_label_types(self): }) return label_types + def _get_timestamp_now(self): + return calendar.timegm(datetime.now().timetuple()) + def update_info(self, label_config_line, collect_analytics=True, project_name='', context=None): if label_config_line != self._label_config_line: self._label_types = self._get_label_types() @@ -86,9 +92,10 @@ def send(self, event_name, **kwargs): json_data = data json_data['event'] = event_name - json_data['user_id'] = self._user_id + json_data['server_id'] = self._user_id + json_data['server_time'] = self._get_timestamp_now() try: - url = 'https://analytics.labelstudio.io/prod' + url = 'https://analytics.labelstud.io/prod' logger.debug('Sending to {url}:\n{data}'.format(url=url, data=json_data)) requests.post(url=url, json=json_data) except requests.RequestException as exc: diff --git a/label_studio/utils/argparser.py b/label_studio/utils/argparser.py new file mode 100644 index 00000000000..af1a4747acf --- /dev/null +++ b/label_studio/utils/argparser.py @@ -0,0 +1,108 @@ +import os + +from label_studio.utils.io import find_dir +from label_studio.utils.misc import iter_config_templates + + +def parse_input_args(): + """ Combine args with json config + + :return: config dict + """ + import sys + import argparse + + if len(sys.argv) == 1: + print('\nQuick start usage: label-studio start my_project --init\n') + + available_templates = [os.path.basename(os.path.dirname(f)) for f in iter_config_templates()] + + def valid_filepath(filepath): + path = os.path.abspath(os.path.expanduser(filepath)) + if os.path.exists(path): + return path + raise FileNotFoundError(filepath) + + root_parser = argparse.ArgumentParser(add_help=False) + root_parser.add_argument( + '-b', '--no-browser', dest='no_browser', action='store_true', + help='Do not open browser at label studio start') + root_parser.add_argument( + '-d', '--debug', dest='debug', action='store_true', + help='Debug mode for Flask', default=None) + root_parser.add_argument( + '--force', dest='force', action='store_true', + help='Force creation new resources if exist') + root_parser.add_argument( + '--root-dir', dest='root_dir', default='.', + help='Projects root directory') + root_parser.add_argument( + '-v', '--verbose', dest='verbose', action='store_true', + help='Increase output verbosity') + root_parser.add_argument( + '--template', dest='template', choices=available_templates, + help='Choose from predefined project templates') + root_parser.add_argument( + '-c', '--config', dest='config_path', type=valid_filepath, + help='Server config') + root_parser.add_argument( + '-l', '--label-config', dest='label_config', type=valid_filepath, + help='Label config path') + root_parser.add_argument( + '-i', '--input-path', dest='input_path', type=valid_filepath, + help='Input path to task file or directory with tasks') + root_parser.add_argument( + '--input-format', dest='input_format', + choices=('json', 'json-dir', 'text', 'text-dir', 'image-dir', 'audio-dir'), default='json', + help='Input path to task file or directory with tasks') + root_parser.add_argument( + '-o', '--output-dir', dest='output_dir', type=valid_filepath, + help='Output directory for completions') + root_parser.add_argument( + '--ml-backend-url', dest='ml_backend_url', + help='Machine learning backend URL') + root_parser.add_argument( + '--ml-backend-name', dest='ml_backend_name', + help='Machine learning backend name') + root_parser.add_argument( + '-p', '--port', dest='port', default=8200, type=int, + help='Server port') + + parser = argparse.ArgumentParser(description='Label studio') + + subparsers = parser.add_subparsers(dest='command', help='Available commands') + subparsers.required = True + + # init sub-command parser + + parser_init = subparsers.add_parser('init', help='Initialize Label Studio', parents=[root_parser]) + parser_init.add_argument( + 'project_name', + help='Path to directory where project state will be initialized') + + # start sub-command parser + + parser_start = subparsers.add_parser('start', help='Start Label Studio server', parents=[root_parser]) + parser_start.add_argument( + 'project_name', + help='Path to directory where project state has been initialized') + parser_start.add_argument( + '--init', dest='init', action='store_true', + help='Initialize if project is not initialized yet') + + # start-multi-session sub-command parser + + parser_start_ms = subparsers.add_parser( + 'start-multi-session', help='Start Label Studio server', parents=[root_parser]) + + args = parser.parse_args() + if args.output_dir is not None: + raise RuntimeError('"--output-dir" option is deprecated and has no effect.\n' + 'All output results are saved to project_name/completions directory') + + label_config_explicitly_specified = hasattr(args, 'label_config') and args.label_config + if args.template and not label_config_explicitly_specified: + args.label_config = os.path.join(find_dir('examples'), args.template, 'config.xml') + if not hasattr(args, 'label_config'): + args.label_config = None + return args diff --git a/label_studio/utils/io.py b/label_studio/utils/io.py index 4d3f3ee3d3e..3bd02ef9a02 100644 --- a/label_studio/utils/io.py +++ b/label_studio/utils/io.py @@ -2,6 +2,8 @@ import pkg_resources import shutil import glob +import io +import json from contextlib import contextmanager from tempfile import mkstemp, mkdtemp @@ -84,3 +86,22 @@ def get_data_dir(): def delete_dir_content(dirpath): for f in glob.glob(dirpath + '/*'): os.remove(f) + + +def remove_file_or_dir(path): + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + shutil.rmtree(path) + + +def iter_files(root_dir, ext): + for root, _, files in os.walk(root_dir): + for f in files: + if f.lower().endswith(ext): + yield os.path.join(root, f) + + +def json_load(file): + with io.open(file) as f: + return json.load(f) diff --git a/label_studio/utils/misc.py b/label_studio/utils/misc.py index 7ddb0a43f9b..469461a70c0 100644 --- a/label_studio/utils/misc.py +++ b/label_studio/utils/misc.py @@ -138,27 +138,6 @@ def config_comments_free(xml_config): return xml_config -class LabelConfigParser(object): - - def __init__(self, filepath): - with io.open(filepath) as f: - self._config = f.read() - - def get_value_for_name(self, name): - tag_iter = ElementTree.fromstring(self._config).iter() - return next(( - tag.attrib.get('value') for tag in tag_iter - if tag.attrib.get('name') == name), None - ) - - def get_input_data_tags(self): - tag_iter = ElementTree.fromstring(self._config).iter() - return [ - tag for tag in tag_iter - if tag.attrib.get('name') and tag.attrib.get('value', '').startswith('$') - ] - - def get_app_version(): return pkg_resources.get_distribution('label-studio').version diff --git a/label_studio/utils/schema/default_config.json b/label_studio/utils/schema/default_config.json new file mode 100644 index 00000000000..2a0de94d421 --- /dev/null +++ b/label_studio/utils/schema/default_config.json @@ -0,0 +1,23 @@ +{ + "title": "Label Studio", + "port": 8200, + "debug": false, + + "label_config": null, + "input_path": null, + "output_dir": "completions", + + "instruction": "Type some hypertext for label experts!", + "allow_delete_completions": true, + "templates_dir": "examples", + + "editor": { + "debug": false + }, + + "!ml_backend": { + "url": "http://localhost:9090", + "model_name": "my_super_model" + }, + "sampling": "uniform" +} \ No newline at end of file diff --git a/setup.py b/setup.py index 4de6f28e9e6..318746efd28 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ import setuptools # Package version -version = '0.4.4.post2' +version = '0.4.5' # Readme with open('README.md', 'r') as f: