From 9a3796c8f5234f6a1f8307d517511083090e872f Mon Sep 17 00:00:00 2001
From: niklub <lubimov.nicolas@gmail.com>
Date: Wed, 29 Jan 2020 20:11:07 +0300
Subject: [PATCH] Input data fixes (#200)

Various fixes for input data handling: task loading, project initialization and command-line argument parsing.
---
 label_studio/project.py                       | 513 +++++++-----------
 label_studio/server.py                        | 170 +-----
 label_studio/tasks.py                         | 103 ++++
 label_studio/templates/import_help.html       |   2 +-
 label_studio/utils/analytics.py               |  11 +-
 label_studio/utils/argparser.py               | 108 ++++
 label_studio/utils/io.py                      |  21 +
 label_studio/utils/misc.py                    |  21 -
 label_studio/utils/schema/default_config.json |  23 +
 setup.py                                      |   2 +-
 10 files changed, 485 insertions(+), 489 deletions(-)
 create mode 100644 label_studio/tasks.py
 create mode 100644 label_studio/utils/argparser.py
 create mode 100644 label_studio/utils/schema/default_config.json

diff --git a/label_studio/project.py b/label_studio/project.py
index a008d76b1ed..d32fa7d5cad 100644
--- a/label_studio/project.py
+++ b/label_studio/project.py
@@ -2,23 +2,23 @@
 import io
 import logging
 import json
-import urllib
-import orjson
 import random
 
 from shutil import copy2
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 from datetime import datetime
 from operator import itemgetter
+from xml.etree import ElementTree
+from uuid import uuid4
 
 from label_studio_converter import Converter
 
-from label_studio.utils.misc import LabelConfigParser, config_line_stripped, config_comments_free, parse_config
+from label_studio.utils.misc import config_line_stripped, config_comments_free, parse_config
 from label_studio.utils.analytics import Analytics
 from label_studio.utils.models import ProjectObj, MLBackend
 from label_studio.utils.exceptions import ValidationError
-from label_studio.utils.io import find_file, delete_dir_content
-
+from label_studio.utils.io import find_file, delete_dir_content, json_load
+from label_studio.tasks import Tasks
 
 logger = logging.getLogger(__name__)
 
@@ -37,21 +37,76 @@ def __init__(self, config, name, context=None):
         self.config = config
         self.name = name
 
+        self.on_boarding = {}
+        self.context = context or {}
+
         self.tasks = None
+        self.load_tasks()
+
+        self.label_config_line, self.label_config_full, self.input_data_tags = None, None, None
+        self.load_label_config()
+
+        self.derived_input_schema, self.derived_output_schema = None, None
+        self.load_derived_schemas()
+
+        self.analytics = None
+        self.load_analytics()
+
+        self.project_obj, self.ml_backend = None, None
+        self.load_project_ml_backend()
+
+        self.converter = None
+        self.load_converter()
+
+    def load_tasks(self):  # (re)load self.tasks from the file at config['input_path']
+        self.tasks = json_load(self.config['input_path'])
+        self.tasks = {int(k): v for k, v in self.tasks.items()}  # JSON object keys load as str; re-key the dict by int
+        print(str(len(self.tasks)) + ' tasks loaded from: ' + self.config['input_path'])
+
+    def load_label_config(self):  # read the label config file and derive the stripped form and input data tags
+        self.label_config_full = config_comments_free(open(self.config['label_config']).read())  # NOTE(review): file handle is never closed explicitly
+        self.label_config_line = config_line_stripped(self.label_config_full)
+        self.input_data_tags = self.get_input_data_tags(self.label_config_line)  # tags with a name and a '$'-prefixed value
+
+    def load_derived_schemas(self):
+        num_tasks_loaded = len(self.tasks)
         self.derived_input_schema = []
         self.derived_output_schema = {
             'from_name_to_name_type': set(),
             'labels': defaultdict(set)
         }
-        self.label_config_line = None
-        self.label_config_full = None
-        self.ml_backend = None
-        self.project_obj = None
-        self.analytics = None
-        self.converter = None
-        self.on_boarding = {}
-        self.context = context or {}
-        self.reload()
+        if num_tasks_loaded > 0:
+            for tag in self.input_data_tags:
+                self.derived_input_schema.append({
+                    'type': tag.tag,
+                    'value': tag.attrib['value'].lstrip('$')
+                })
+
+        # for all already completed tasks we update derived output schema for further label config validation
+        for task_id in self.get_task_ids():
+            task_with_completions = self.get_task_with_completions(task_id)
+            if task_with_completions and 'completions' in task_with_completions:
+                completions = task_with_completions['completions']
+                for completion in completions:
+                    self._update_derived_output_schema(completion)
+
+    def load_analytics(self):  # build a fresh Analytics object for the current label config
+        collect_analytics = os.getenv('collect_analytics')  # env var wins over config; NOTE(review): getenv returns a str, passed through as-is
+        if collect_analytics is None:
+            collect_analytics = self.config.get('collect_analytics', True)  # True when the config has no such key
+        self.analytics = Analytics(self.label_config_line, collect_analytics, self.name, self.context)
+
+    def load_project_ml_backend(self):  # rebuild self.project_obj and, when configured, connect an ML backend to it
+        # configure project
+        self.project_obj = ProjectObj(label_config=self.label_config_line, label_config_full=self.label_config_full)
+        # configure machine learning backend
+        ml_backend_params = self.config.get('ml_backend')  # missing or empty value: self.ml_backend is left untouched
+        if ml_backend_params:
+            self.ml_backend = MLBackend.from_params(ml_backend_params)
+            self.project_obj.connect(self.ml_backend)
+
+    def load_converter(self):  # rebuild the export Converter from self.label_config_full
+        self.converter = Converter(self.label_config_full)
 
     @property
     def id(self):
@@ -72,6 +127,7 @@ def validate_label_config(self, config_string):
         self.project_obj.validate_label_config(config_string)
 
         parsed_config = parse_config(config_string)
+
         self.validate_label_config_on_derived_input_schema(parsed_config)
         self.validate_label_config_on_derived_output_schema(parsed_config)
 
@@ -81,13 +137,21 @@ def update_label_config(self, new_label_config):
         with io.open(label_config_file, mode='w') as f:
             f.write(new_label_config)
 
+        # reload everything that depends on label config
+        self.load_label_config()
+        self.load_derived_schemas()
+        self.load_analytics()
+        self.load_project_ml_backend()
+        self.load_converter()
+
         # save project config state
         self.config['label_config_updated'] = True
         with io.open(self.config['config_path'], mode='w') as f:
             json.dump(self.config, f)
         logger.info('Label config saved to: {path}'.format(path=label_config_file))
 
-    def _get_single_input_value(self, input_data_tags):
+    @classmethod
+    def _get_single_input_value(cls, input_data_tags):
         if len(input_data_tags) > 1:
             val = ",".join(tag.attrib.get("name") for tag in input_data_tags)
             print('Warning! Multiple input data tags found: ' +
@@ -96,38 +160,6 @@ def _get_single_input_value(self, input_data_tags):
         data_key = input_data_tag.attrib.get('value').lstrip('$')
         return data_key
 
-    def _create_task_with_local_uri(self, filepath, data_key, task_id):
-        """ Convert filepath to task with flask serving URL
-        """
-        filename = os.path.basename(self, filepath)
-        params = urllib.parse.urlencode({'d': os.path.dirname(filepath)})
-        base_url = 'http://localhost:{port}/'.format(port=self.config.get("port"))
-        image_url_path = base_url + urllib.parse.quote('data/' + filename)
-        image_local_url = '{image_url_path}?{params}'.format(image_url_path=image_url_path, params=params)
-        return {
-            'id': task_id,
-            'task_path': filepath,
-            'data': {data_key: image_local_url}
-        }
-
-    def is_text_annotation(self, input_data_tags, filepath):
-        return (
-            len(input_data_tags) == 1 and input_data_tags[0].tag == 'Text'
-            and filepath.endswith(self._allowed_extensions['Text'])
-        )
-
-    def is_image_annotation(self, input_data_tags, filepath):
-        return (
-            len(input_data_tags) == 1 and input_data_tags[0].tag == 'Image'
-            and filepath.lower().endswith(self._allowed_extensions['Image'])
-        )
-
-    def is_audio_annotation(self, input_data_tags, filepath):
-        return (
-            len(input_data_tags) == 1 and input_data_tags[0].tag in ('Audio', 'AudioPlus')
-            and filepath.lower().endswith(self._allowed_extensions['Audio'])
-        )
-
     def _update_derived_output_schema(self, completion):
         """
         Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type)
@@ -139,7 +171,7 @@ def _update_derived_output_schema(self, completion):
             self.derived_output_schema['from_name_to_name_type'].add((
                 result['from_name'], result['to_name'], result['type']
             ))
-            for label in result['value'][result['type']]:
+            for label in result['value'].get(result['type'], []):
                 self.derived_output_schema['labels'][result['from_name']].add(label)
 
     def validate_label_config_on_derived_input_schema(self, config_string_or_parsed_config):
@@ -225,111 +257,6 @@ def validate_label_config_on_derived_output_schema(self, config_string_or_parsed
                     .format(from_name=from_name, extra_labels=extra_labels)
                 )
 
-    def tasks_from_json_file(self, path):
-        """ Prepare tasks from json
-
-        :param path: path to json with list or dict
-        :param tasks: main db instance of tasks
-        :return: new task id
-        """
-        def push_task(root):
-            task_id = len(self.tasks) + 1
-            data = root['data'] if 'data' in root else root
-            self.tasks[task_id] = {'id': task_id, 'task_path': path, 'data': data}
-            if 'predictions' in data:
-                self.tasks[task_id]['predictions'] = data['predictions']
-                self.tasks[task_id]['data'].pop('predictions', None)
-            if 'predictions' in root:
-                self.tasks[task_id]['predictions'] = root['predictions']
-
-        logger.debug('Reading tasks from JSON file ' + path)
-        with open(path) as f:
-            json_body = orjson.loads(f.read())
-
-            # multiple tasks in file
-            if isinstance(json_body, list):
-                [push_task(data) for data in json_body]
-
-            # one task in file
-            elif isinstance(json_body, dict):
-                push_task(json_body)
-
-            # unsupported task type
-            else:
-                raise Exception('Unsupported task data:', path)
-
-    def _init(self):
-        label_config = LabelConfigParser(self.config['label_config'])
-
-        if not os.path.exists(self.config['output_dir']):
-            os.mkdir(self.config['output_dir'])
-
-        task_id = 0
-        data_key = None
-
-        input_data_tags = label_config.get_input_data_tags()
-
-        # load at first start
-        self.tasks = OrderedDict()
-
-        # file
-        if os.path.isfile(self.config['input_path']):
-            files = [os.path.basename(self.config['input_path'])]
-            root_dir = os.path.normpath(os.path.dirname(self.config['input_path']))
-
-        # directory
-        else:
-            root_dir = os.path.normpath(self.config['input_path'])
-            files = [os.path.join(root, f) for root, _, files in os.walk(root_dir) for f in files \
-                     if 'completion' not in f and 'completion' not in root]
-
-        # walk over all the files
-        for f in files:
-            norm_f = os.path.normpath(f)
-            path = os.path.join(root_dir, norm_f) if not norm_f.startswith(root_dir) else f
-
-            # load tasks from json
-            if f.endswith('.json'):
-                self.tasks_from_json_file(path)
-
-            # load tasks from txt: line by line, task by task
-            elif self.is_text_annotation(input_data_tags, f):
-                if data_key is None:
-                    data_key = self._get_single_input_value(input_data_tags)
-                with io.open(path) as fin:
-                    for line in fin:
-                        task_id = len(self.tasks) + 1
-                        self.tasks[task_id] = {'id': task_id, 'task_path': path, 'data': {data_key: line.strip()}}
-
-            # load tasks from files: creating URI to local resources
-            elif self.is_image_annotation(input_data_tags, f) or self.is_audio_annotation(input_data_tags, f):
-                if data_key is None:
-                    data_key = self._get_single_input_value(input_data_tags)
-                task_id = len(self.tasks) + 1
-                self.tasks[task_id] = self._create_task_with_local_uri(f, data_key, task_id)
-            else:
-                logger.warning('Unrecognized file format for file ' + f)
-
-        num_tasks_loaded = len(self.tasks)
-
-        # make derived input schema
-        if num_tasks_loaded > 0:
-            for tag in input_data_tags:
-                self.derived_input_schema.append({
-                    'type': tag.tag,
-                    'value': tag.attrib['value'].lstrip('$')
-                })
-
-        # for all already completed tasks we update derived output schema for further label config validation
-        for task_id in self.get_task_ids():
-            task_with_completions = self.get_task_with_completions(task_id)
-            if task_with_completions and 'completions' in task_with_completions:
-                completions = task_with_completions['completions']
-                for completion in completions:
-                    self._update_derived_output_schema(completion)
-
-        print(str(len(self.tasks)) + ' tasks loaded from: ' + self.config['input_path'])
-
     def get_tasks(self):
         """ Load tasks from JSON files in input_path directory
 
@@ -343,9 +270,13 @@ def delete_tasks(self):
         :return:
         """
         delete_dir_content(self.config['output_dir'])
-        with io.open(self.config['input_path'], mode='w') as f:
-            json.dump([], f)
-        self.reload()
+        if os.path.exists(self.config['input_path']) and os.path.isfile(self.config['input_path']):
+            with io.open(self.config['input_path'], mode='w') as f:
+                json.dump({}, f)
+
+        # reload everything related to tasks
+        self.load_tasks()
+        self.load_derived_schemas()
 
     def iter_tasks(self):
         sampling = self.config.get('sampling', 'sequential')
@@ -468,43 +399,40 @@ def delete_completion(self, task_id):
         filename = os.path.join(self.config['output_dir'], str(task_id) + '.json')
         os.remove(filename)
 
-    def reload(self):
-        self.tasks = None
-        self.derived_input_schema = []
-        self.derived_output_schema = {
-            'from_name_to_name_type': set(),
-            'labels': defaultdict(set)
-        }
-
-        self._init()
-
-        self.label_config_full = config_comments_free(open(self.config['label_config']).read())
-        self.label_config_line = config_line_stripped(self.label_config_full)
-
-        collect_analytics = os.getenv('collect_analytics')
-        if collect_analytics is None:
-            collect_analytics = self.config.get('collect_analytics', True)
-        if self.analytics is None:
-            self.analytics = Analytics(self.label_config_line, collect_analytics, self.name, self.context)
-        else:
-            self.analytics.update_info(self.label_config_line, collect_analytics, self.name, self.context)
-
-        # configure project
-        self.project_obj = ProjectObj(label_config=self.label_config_line, label_config_full=self.label_config_full)
-
-        # configure machine learning backend
-        if self.ml_backend is None:
-            ml_backend_params = self.config.get('ml_backend')
-            if ml_backend_params:
-                ml_backend = MLBackend.from_params(ml_backend_params)
-                self.project_obj.connect(ml_backend)
-
-        self.converter = Converter(self.label_config_full)
-
     @classmethod
     def get_project_dir(cls, project_name, args):
         return os.path.join(args.root_dir, project_name)
 
+    @classmethod
+    def get_input_data_tags(cls, label_config):  # label_config: XML text of the labeling config; returns a list of elements
+        tag_iter = ElementTree.fromstring(label_config).iter()  # iterates the root element and all of its descendants
+        return [
+            tag for tag in tag_iter
+            if tag.attrib.get('name') and tag.attrib.get('value', '').startswith('$')  # keep named tags whose value starts with '$'
+        ]
+
+    @classmethod
+    def _load_tasks(cls, input_path, args, label_config_file):  # dispatch on args.input_format to the matching Tasks loader
+        with io.open(label_config_file) as f:
+            label_config = f.read()
+
+        task_loader = Tasks()
+        if args.input_format == 'json':
+            return task_loader.from_json_file(input_path)
+        if args.input_format == 'json-dir':
+            return task_loader.from_dir_with_json_files(input_path)
+        input_data_tags = cls.get_input_data_tags(label_config)  # only the non-JSON formats below need a data key
+        data_key = Project._get_single_input_value(input_data_tags)
+        if args.input_format == 'text':
+            return task_loader.from_text_file(input_path, data_key)
+        if args.input_format == 'text-dir':
+            return task_loader.from_dir_with_text_files(input_path, data_key)
+        if args.input_format == 'image-dir':
+            return task_loader.from_dir_with_image_files(input_path, data_key)
+        if args.input_format == 'audio-dir':
+            return task_loader.from_dir_with_audio_files(input_path, data_key)
+        raise RuntimeError('Can\'t load tasks for input format={}'.format(args.input_format))  # no branch matched args.input_format
+
     @classmethod
     def create_project_dir(cls, project_name, args):
         """
@@ -517,81 +445,74 @@ def create_project_dir(cls, project_name, args):
         """
         dir = cls.get_project_dir(project_name, args)
         os.makedirs(dir, exist_ok=True)
-        label_config_name = 'config.xml'
-        output_dir_name = 'completions'
-        input_path_name = 'tasks.json'
-        default_config_file = os.path.join(dir, 'config.json')
-        default_label_config_file = os.path.join(dir, label_config_name)
-        default_output_dir = os.path.join(dir, output_dir_name)
-        default_input_path = os.path.join(dir, input_path_name)
-
-        if hasattr(args, 'config_path') and args.config_path:
-            copy2(args.config_path, default_config_file)
-        if hasattr(args, 'input_path') and args.input_path:
-            copy2(args.input_path, default_input_path)
-        if hasattr(args, 'output_dir') and args.output_dir:
-            if os.path.exists(args.output_dir):
-                copy2(args.output_dir, default_output_dir)
-        if hasattr(args, 'label_config') and args.label_config:
-            copy2(args.label_config, default_label_config_file)
-
-        default_config = {
-            'title': 'Label Studio',
-            'port': 8200,
-            'debug': False,
-
-            'label_config': label_config_name,
-            'input_path': input_path_name,
-            'output_dir': output_dir_name,
-
-            'instruction': 'Type some <b>hypertext</b> for label experts!',
-            'allow_delete_completions': True,
-            'templates_dir': 'examples',
-
-            'editor': {
-                'debug': False
-            },
-
-            '!ml_backend': {
-                'url': 'http://localhost:9090',
-                'model_name': 'my_super_model'
-            },
-            'sampling': 'uniform'
-        }
-
-        # create input_path (tasks.json)
-        if not os.path.exists(default_input_path):
-            with io.open(default_input_path, mode='w') as fout:
-                json.dump([], fout, indent=2)
-            print(default_input_path + ' input path has been created.')
-        else:
-            print(default_input_path + ' input path already exists.')
 
-        # create config file (config.json)
-        if not os.path.exists(default_config_file):
-            with io.open(default_config_file, mode='w') as fout:
-                json.dump(default_config, fout, indent=2)
-            print(default_config_file + ' config file has been created.')
-        else:
-            print(default_config_file + ' config file already exists.')
+        config = json_load(args.config_path) if args.config_path else json_load(find_file('default_config.json'))
 
-        # create label config (config.xml)
-        if not os.path.exists(default_label_config_file):
-            path = find_file('examples/image_polygons/config.xml')
-            default_label_config = open(path).read()
+        def already_exists_error(what, path):
+            raise RuntimeError('{path} {what} already exists. Use "--force" option to recreate it.'.format(
+                path=path, what=what
+            ))
 
-            with io.open(default_label_config_file, mode='w') as fout:
-                fout.write(default_label_config)
-            print(default_label_config_file + ' label config file has been created.')
+        # save label config
+        config_xml = 'config.xml'
+        config_xml_path = os.path.join(dir, config_xml)
+        label_config_file = args.label_config or config.get('label_config')
+        if label_config_file:
+            copy2(label_config_file, config_xml_path)
+            print(label_config_file + ' label config copied to ' + config_xml_path)
         else:
-            print(default_label_config_file + ' label config file already exists.')
-
-        # create output dir (completions)
-        if not os.path.exists(default_output_dir):
-            os.makedirs(default_output_dir)
-            print(default_output_dir + ' output directory has been created.')
+            if os.path.exists(config_xml_path) and not args.force:
+                already_exists_error('label config', config_xml_path)
+            default_label_config = find_file('examples/image_polygons/config.xml')
+            copy2(default_label_config, config_xml_path)
+            print(default_label_config + ' label config copied to ' + config_xml_path)
+        config['label_config'] = config_xml
+
+        # save tasks.json
+        tasks_json = 'tasks.json'
+        tasks_json_path = os.path.join(dir, tasks_json)
+        input_path = args.input_path or config.get('input_path')
+        if input_path:
+            tasks = cls._load_tasks(input_path, args, config_xml_path)
+            with io.open(tasks_json_path, mode='w') as fout:
+                json.dump(tasks, fout, indent=2)
+            print(tasks_json_path + ' input path has been created from ' + input_path)
+        else:
+            if os.path.exists(tasks_json_path) and not args.force:
+                already_exists_error('input path', tasks_json_path)
+            with io.open(tasks_json_path, mode='w') as fout:
+                json.dump({}, fout)
+            print(tasks_json_path + ' input path has been created with empty tasks.')
+        config['input_path'] = tasks_json
+
+        # create completions dir
+        completions_dir = os.path.join(dir, 'completions')
+        if os.path.exists(completions_dir) and not args.force:
+            already_exists_error('output dir', completions_dir)
+        if os.path.exists(completions_dir):
+            delete_dir_content(completions_dir)
+            print(completions_dir + ' output dir already exists. Clear it.')
         else:
-            print(default_output_dir + ' output directory already exists.')
+            os.makedirs(completions_dir, exist_ok=True)
+            print(completions_dir + ' output dir has been created.')
+        config['output_dir'] = 'completions'
+
+        if args.ml_backend_url:
+            if 'ml_backend' not in config or not isinstance(config['ml_backend'], dict):
+                config['ml_backend'] = {}
+            config['ml_backend']['url'] = args.ml_backend_url
+            if args.ml_backend_name:
+                config['ml_backend']['name'] = args.ml_backend_name
+            else:
+                config['ml_backend']['name'] = str(uuid4())
+
+        # create config.json
+        config_json = 'config.json'
+        config_json_path = os.path.join(dir, config_json)
+        if os.path.exists(config_json_path) and not args.force:
+            already_exists_error('config', config_json_path)
+        with io.open(config_json_path, mode='w') as f:
+            json.dump(config, f, indent=2)
 
         print('')
         print('Label Studio has been successfully initialized. Check project states in ' + dir)
@@ -601,67 +522,34 @@ def create_project_dir(cls, project_name, args):
     @classmethod
     def _get_config(cls, project_dir, args):
         """
-        Get config path from input args Namespace acquired by Argparser
-        :param args:
+        Get config from input args Namespace acquired by Argparser
         :param args:
         :return:
         """
-        # if config is explicitly specified, just return it
-        if args.config_path:
-            config_path = args.config_path
-        else:
-            # check if project directory exists
-            if not os.path.exists(project_dir):
-                raise FileNotFoundError(
-                    'Couldn\'t find directory ' + project_dir +
-                    ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' +
-                    args.project_name + ' --init'
-                )
-
-            # check config.json exists in directory
-            config_path = os.path.join(project_dir, 'config.json')
-            if not os.path.exists(config_path):
-                raise FileNotFoundError(
-                    'Couldn\'t find config file ' + config_path + ' in project directory ' + project_dir +
-                    ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + args.project_name + ' --init'
-                )
+        # check if project directory exists
+        if not os.path.exists(project_dir):
+            raise FileNotFoundError(
+                'Couldn\'t find directory ' + project_dir +
+                ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' +
+                args.project_name + ' --init'
+            )
+
+        # check config.json exists in directory
+        config_path = os.path.join(project_dir, 'config.json')
+        if not os.path.exists(config_path):
+            raise FileNotFoundError(
+                'Couldn\'t find config file ' + config_path + ' in project directory ' + project_dir +
+                ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + args.project_name + ' --init'
+            )
 
         config_path = os.path.abspath(config_path)
         with io.open(config_path) as c:
             config = json.load(c)
 
-        if args.port:
-            config['port'] = args.port
-
-        if args.label_config:
-            config['label_config'] = args.label_config
-
-        if args.input_path:
-            config['input_path'] = args.input_path
-
-        if args.output_dir:
-            config['output_dir'] = args.output_dir
-
-        if args.debug is not None:
-            config['debug'] = args.debug
-
-        if args.ml_backend_url:
-            if 'ml_backend' not in config:
-                config['ml_backend'] = {}
-            config['ml_backend']['url'] = args.ml_backend_url
-
-        if args.ml_backend_name:
-            if 'ml_backend' not in config:
-                config['ml_backend'] = {}
-            config['ml_backend']['name'] = args.ml_backend_name
-
-        # absolutize paths relative to config.json
-        config_dir = os.path.dirname(config_path)
-        config['label_config'] = os.path.join(config_dir, config['label_config'])
-        config['input_path'] = os.path.join(config_dir, config['input_path'])
-        config['output_dir'] = os.path.join(config_dir, config['output_dir'])
         config['config_path'] = config_path
-
+        config['input_path'] = os.path.join(os.path.dirname(config_path), config['input_path'])
+        config['label_config'] = os.path.join(os.path.dirname(config_path), config['label_config'])
+        config['output_dir'] = os.path.join(os.path.dirname(config_path), config['output_dir'])
         return config
 
     @classmethod
@@ -681,6 +569,7 @@ def get(cls, project_name, args, context):
         if os.path.exists(project_dir):
             project = cls._load_from_dir(project_dir, project_name, args, context)
             cls._storage[project_name] = project
+            return project
 
         raise KeyError('Project {p} doesn\'t exist'.format(p=project_name))
 
diff --git a/label_studio/server.py b/label_studio/server.py
index 3ab05b63ce6..37a8dd9cac4 100644
--- a/label_studio/server.py
+++ b/label_studio/server.py
@@ -26,11 +26,11 @@
 from label_studio.utils.validation import TaskValidator
 from label_studio.utils.exceptions import ValidationError
 from label_studio.utils.functions import generate_sample_task_without_check, data_examples
-from label_studio.utils.misc import (
-    exception_treatment, log_config, log, config_line_stripped,
-    get_config_templates, iter_config_templates
-)
+from label_studio.utils.misc import exception_treatment, log_config, log, config_line_stripped, get_config_templates
+from label_studio.utils.argparser import parse_input_args
+
 from label_studio.project import Project
+from label_studio.tasks import Tasks
 
 logger = logging.getLogger(__name__)
 
@@ -208,6 +208,7 @@ def import_page():
     project = project_get_or_create()
 
     project.analytics.send(getframeinfo(currentframe()).function)
+    project.project_obj.name = project.name
     return flask.render_template(
         'import.html',
         config=project.config,
@@ -302,7 +303,6 @@ def api_save_config():
         return make_response(jsonify({'label_config': [str(e)]}), status.HTTP_400_BAD_REQUEST)
 
     project.update_label_config(label_config)
-    project.reload()
     project.analytics.send(getframeinfo(currentframe()).function)
     return Response(status=status.HTTP_201_CREATED)
 
@@ -401,30 +401,19 @@ class DjangoRequest:
     except ValidationError as e:
         return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST)
 
-    # save task file to input dir
-    if os.path.isdir(project.config['input_path']):
-        # tasks are in directory, write a new file with tasks
-        task_dir = project.config['input_path']
-        now = datetime.now()
-        data = json.dumps(new_tasks, ensure_ascii=False)
-        md5 = hashlib.md5(json.dumps(data).encode('utf-8')).hexdigest()
-        name = 'import-' + now.strftime('%Y-%m-%d-%H-%M') + '-' + str(md5[0:8])
-        path = os.path.join(task_dir, name + '.json')
-        tasks = new_tasks
-    else:
-        # tasks are all in one file, append it
-        path = project.config['input_path']
-        old_tasks = json.load(open(path))
-        assert isinstance(old_tasks, list), 'Tasks from input_path must be list'
-        tasks = old_tasks + new_tasks
-        logger.error("It's recommended to use directory as input_path: " +
-                     project.config['input_path'] + ' -> ' + os.path.dirname(project.config['input_path']))
+    # tasks are all in one file, append it
+    path = project.config['input_path']
+    old_tasks = json.load(open(path))  # dict keyed by task id; JSON object keys come back as str
+    max_id_in_old_tasks = max(map(int, old_tasks.keys())) if old_tasks else -1  # compare ids as ints: str max is lexicographic and str + 1 raises TypeError
+    new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1)
+    old_tasks.update(new_tasks)
 
     with open(path, 'w') as f:
-        json.dump(tasks, f, ensure_ascii=False, indent=4)
+        json.dump(old_tasks, f, ensure_ascii=False, indent=4)
 
-    # load new tasks
-    project.reload()
+    # load new tasks and everything related
+    project.load_tasks()
+    project.load_derived_schemas()
 
     duration = time.time() - start
     return make_response(jsonify({
@@ -627,138 +616,17 @@ def get_data_file(filename):
     return flask.send_from_directory(directory, filename, as_attachment=True)
 
 
-def parse_input_args():
-    """ Combine args with json config
-
-    :return: config dict
-    """
-    import sys
-    import argparse
-
-    if len(sys.argv) == 1:
-        print('\nQuick start usage: label-studio start my_project --init\n')
-
-    root_parser = argparse.ArgumentParser(add_help=False)
-    root_parser.add_argument(
-        '-b', '--no-browser', dest='no_browser', action='store_true',
-        help='Do not open browser at label studio start'
-    )
-    root_parser.add_argument(
-        '-d', '--debug', dest='debug', action='store_true',
-        help='Debug mode for Flask', default=None
-    )
-    root_parser.add_argument(
-        '--root-dir', dest='root_dir', default='.',
-        help='Projects root directory'
-    )
-    root_parser.add_argument(
-        '-v', '--verbose', dest='verbose', action='store_true',
-        help='Increase output verbosity')
-
-    parser = argparse.ArgumentParser(description='Label studio')
-
-    subparsers = parser.add_subparsers(dest='command', help='Available commands')
-    subparsers.required = True
-
-    # init sub-command parser
-
-    available_templates = [os.path.basename(os.path.dirname(f)) for f in iter_config_templates()]
-
-    parser_init = subparsers.add_parser('init', help='Initialize Label Studio', parents=[root_parser])
-    parser_init.add_argument(
-        'project_name',
-        help='Path to directory where project state will be initialized')
-    parser_init.add_argument(
-        '--template', dest='template', choices=available_templates,
-        help='Choose from predefined project templates'
-    )
-
-    # start sub-command parser
-
-    parser_start = subparsers.add_parser('start', help='Start Label Studio server', parents=[root_parser])
-    parser_start.add_argument(
-        'project_name',
-        help='Path to directory where project state has been initialized'
-    )
-    parser_start.add_argument(
-        '--init', dest='init', action='store_true',
-        help='Initialize if project is not initialized yet'
-    )
-    parser_start.add_argument(
-        '--template', dest='template', choices=available_templates,
-        help='Choose from predefined project templates'
-    )
-    parser_start.add_argument(
-        '-c', '--config', dest='config_path',
-        help='Server config')
-    parser_start.add_argument(
-        '-l', '--label-config', dest='label_config', default='',
-        help='Label config path')
-    parser_start.add_argument(
-        '-i', '--input-path', dest='input_path', default='',
-        help='Input path to task file or directory with tasks')
-    parser_start.add_argument(
-        '-o', '--output-dir', dest='output_dir', default='',
-        help='Output directory for completions')
-    parser_start.add_argument(
-        '-p', '--port', dest='port', default=8200, type=int,
-        help='Server port')
-    parser_start.add_argument(
-        '--ml-backend-url', dest='ml_backend_url',
-        help='Machine learning backend URL')
-    parser_start.add_argument(
-        '--ml-backend-name', dest='ml_backend_name',
-        help='Machine learning backend name')
-
-    # start-multi-session sub-command parser
-
-    parser_start_ms = subparsers.add_parser(
-        'start-multi-session', help='Start Label Studio server', parents=[root_parser])
-    parser_start_ms.add_argument(
-        '--template', dest='template', choices=available_templates,
-        help='Choose from predefined project templates'
-    )
-    parser_start_ms.add_argument(
-        '-c', '--config', dest='config_path',
-        help='Server config')
-    parser_start_ms.add_argument(
-        '-l', '--label-config', dest='label_config', default='',
-        help='Label config path')
-    parser_start_ms.add_argument(
-        '-i', '--input-path', dest='input_path', default='',
-        help='Input path to task file or directory with tasks')
-    parser_start_ms.add_argument(
-        '-o', '--output-dir', dest='output_dir', default='',
-        help='Output directory for completions')
-    parser_start_ms.add_argument(
-        '-p', '--port', dest='port', default=8200, type=int,
-        help='Server port')
-    parser_start_ms.add_argument(
-        '--ml-backend-url', dest='ml_backend_url',
-        help='Machine learning backend URL')
-    parser_start_ms.add_argument(
-        '--ml-backend-name', dest='ml_backend_name',
-        help='Machine learning backend name')
-
-    args = parser.parse_args()
-    label_config_explicitly_specified = hasattr(args, 'label_config') and args.label_config
-    if args.template and not label_config_explicitly_specified:
-        args.label_config = os.path.join(find_dir('examples'), args.template, 'config.xml')
-    if not hasattr(args, 'label_config'):
-        args.label_config = None
-    return args
-
-
 def main():
     import threading
     import webbrowser
 
-    import label_studio.utils.functions
-
     global input_args
 
     input_args = parse_input_args()
 
+    import label_studio.utils.functions
+    label_studio.utils.functions.HOSTNAME = 'http://localhost:' + str(input_args.port)
+
     # On `init` command, create directory args.project_name with initial project state and exit
     if input_args.command == 'init':
         Project.create_project_dir(input_args.project_name, input_args)
@@ -770,8 +638,6 @@ def main():
         if input_args.init:
             Project.create_project_dir(input_args.project_name, input_args)
 
-    label_studio.utils.functions.HOSTNAME = 'http://localhost:' + str(input_args.port)
-
     # On `start` command, launch browser if --no-browser is not specified and start label studio server
     if input_args.command == 'start':
         if not input_args.no_browser:
diff --git a/label_studio/tasks.py b/label_studio/tasks.py
new file mode 100644
index 00000000000..49ca5980707
--- /dev/null
+++ b/label_studio/tasks.py
@@ -0,0 +1,103 @@
+import orjson
+import os
+import io
+import urllib
+
+from label_studio.utils.io import iter_files
+
+
+class Tasks(object):
+    """ Builds {task_id: task} dicts from raw dicts, JSON/text files and directories with media files """
+
+    _allowed_extensions = {
+        'Text': ('.txt',),
+        'Image': ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'),
+        'Audio': ('.wav', '.aiff', '.mp3', '.au', '.flac')
+    }
+
+    def _create_task_with_local_uri(self, filepath, data_key, task_id):
+        """ Convert filepath to task with flask serving URL: <HOSTNAME>/data/<filename>?d=<dirname> """
+        from label_studio.utils.functions import HOSTNAME
+        # bare "import urllib" does not guarantee that the urllib.parse submodule is loaded
+        from urllib.parse import quote, urlencode
+
+        dirname, filename = os.path.split(filepath)
+        url = HOSTNAME + '/' + quote('data/' + filename) + '?' + urlencode({'d': dirname})
+        return {'id': task_id, 'task_path': filepath, 'data': {data_key: url}}
+
+    def from_dict(self, d, task_id=0):
+        """ Wrap one raw task dict as {task_id: {'id', 'data'[, 'predictions']}}
+
+        'data' is d['data'] when present, else d itself; 'predictions' found inside the data
+        is moved out of it (in place), and top-level d['predictions'] takes precedence.
+        """
+        data = d['data'] if 'data' in d else d
+        task = {'id': task_id, 'data': data}
+        if 'predictions' in data:
+            task['predictions'] = data.pop('predictions')
+        if 'predictions' in d:
+            task['predictions'] = d['predictions']
+        return {task_id: task}
+
+    def from_list_of_dicts(self, l, start_task_id=0):
+        """ Convert list of raw task dicts, numbering them consecutively from start_task_id """
+        tasks = {}
+        for task_id, d in enumerate(l, start_task_id):
+            tasks.update(self.from_dict(d, task_id))
+        return tasks
+
+    def from_json_file(self, path, start_task_id=0):
+        """ Load tasks from JSON file holding either a list of tasks or a single task dict """
+        with open(path) as f:
+            json_body = orjson.loads(f.read())
+
+        # multiple tasks in file
+        if isinstance(json_body, list):
+            return self.from_list_of_dicts(json_body, start_task_id)
+
+        # one task in file
+        if isinstance(json_body, dict):
+            return self.from_dict(json_body, start_task_id)
+
+        # unsupported task type
+        raise Exception('Unsupported task data:', path)
+
+    def from_dir_with_json_files(self, path):
+        """ Load tasks from every .json file yielded by iter_files(path), ids continue across files """
+        tasks = {}
+        for f in iter_files(path, ext='.json'):
+            tasks.update(self.from_json_file(f, start_task_id=len(tasks)))
+        return tasks
+
+    def from_text_file(self, path, data_key, start_task_id=0):
+        """ Create one task per line of text file, ids are consecutive starting from start_task_id """
+        tasks = {}
+        with io.open(path) as f:
+            # enumerate gives every line its own id; a fixed id would keep only the last line
+            for task_id, line in enumerate(f, start_task_id):
+                tasks[task_id] = {'id': task_id, 'data': {data_key: line.strip()}}
+        return tasks
+
+    def from_dir_with_text_files(self, path, data_key):
+        """ Create one task per line of every file yielded by iter_files(path, ext='') """
+        tasks = {}
+        for f in iter_files(path, ext=''):
+            tasks.update(self.from_text_file(f, data_key, start_task_id=len(tasks)))
+        return tasks
+
+    def _from_dir_with_local_resources(self, path, data_key, data_type):
+        """ Create one locally served task per file in path having an extension allowed for data_type """
+        tasks = {}
+        for f in iter_files(path, ext=self._allowed_extensions[data_type]):
+            # NOTE(review): ids start from 1 here but from 0 in other loaders - confirm it is intended
+            task_id = len(tasks) + 1
+            tasks[task_id] = self._create_task_with_local_uri(f, data_key, task_id)
+        return tasks
+
+    def from_dir_with_image_files(self, path, data_key):
+        """ Create tasks for files with 'Image' extensions found in path """
+        return self._from_dir_with_local_resources(path, data_key, 'Image')
+
+    def from_dir_with_audio_files(self, path, data_key):
+        """ Create tasks for files with 'Audio' extensions found in path """
+        return self._from_dir_with_local_resources(path, data_key, 'Audio')
diff --git a/label_studio/templates/import_help.html b/label_studio/templates/import_help.html
index a14558ce1ca..eb514cdc2e7 100644
--- a/label_studio/templates/import_help.html
+++ b/label_studio/templates/import_help.html
@@ -3,7 +3,7 @@
   <div class="header active title"><i class="dropdown icon"></i>Import formats and examples</div>
   <div class="content active">
     You can use open source datasets and build tasks in
-    <a class="no-go" target="_blank" href='https://data.heartex.net/?data_types={{ project.data_types_json }}'>
+    <a class="no-go" target="_blank" href='https://data.heartex.net/?data_types={{ project.data_types_json }}&ref=label-studio&p={{ project.name }}'>
       Heartex.Datasets
     </a>.
     <br/><br/>
diff --git a/label_studio/utils/analytics.py b/label_studio/utils/analytics.py
index b22207bc573..9d397af8134 100644
--- a/label_studio/utils/analytics.py
+++ b/label_studio/utils/analytics.py
@@ -2,10 +2,13 @@
 import os
 import io
 import requests
+import calendar
 
+from datetime import datetime
 from mixpanel import Mixpanel, MixpanelException
 from copy import deepcopy
 from operator import itemgetter
+
 from uuid import uuid4
 from .misc import get_app_version, parse_config, convert_string_to_hash
 from .io import get_config_dir
@@ -63,6 +66,9 @@ def _get_label_types(self):
             })
         return label_types
 
+    def _get_timestamp_now(self):
+        return calendar.timegm(datetime.utcnow().timetuple())  # timegm() expects a UTC time tuple
+
     def update_info(self, label_config_line, collect_analytics=True, project_name='', context=None):
         if label_config_line != self._label_config_line:
             self._label_types = self._get_label_types()
@@ -86,9 +92,10 @@ def send(self, event_name, **kwargs):
 
         json_data = data
         json_data['event'] = event_name
-        json_data['user_id'] = self._user_id
+        json_data['server_id'] = self._user_id
+        json_data['server_time'] = self._get_timestamp_now()
         try:
-            url = 'https://analytics.labelstudio.io/prod'
+            url = 'https://analytics.labelstud.io/prod'
             logger.debug('Sending to {url}:\n{data}'.format(url=url, data=json_data))
             requests.post(url=url, json=json_data)
         except requests.RequestException as exc:
diff --git a/label_studio/utils/argparser.py b/label_studio/utils/argparser.py
new file mode 100644
index 00000000000..af1a4747acf
--- /dev/null
+++ b/label_studio/utils/argparser.py
@@ -0,0 +1,108 @@
+import os
+
+from label_studio.utils.io import find_dir
+from label_studio.utils.misc import iter_config_templates
+
+
+def parse_input_args():
+    """ Parse command line arguments of "init", "start" and "start-multi-session" commands
+
+    :return: argparse.Namespace with parsed arguments
+    """
+    import sys
+    import argparse
+
+    if len(sys.argv) == 1:
+        print('\nQuick start usage: label-studio start my_project --init\n')
+
+    available_templates = [os.path.basename(os.path.dirname(f)) for f in iter_config_templates()]
+
+    def valid_filepath(filepath):
+        path = os.path.abspath(os.path.expanduser(filepath))
+        if os.path.exists(path):
+            return path
+        # argparse turns ArgumentTypeError into a usage error; FileNotFoundError would escape as a traceback
+        raise argparse.ArgumentTypeError('path does not exist: ' + filepath)
+
+    # options shared by all sub-commands (they inherit them via parents=[root_parser])
+    root_parser = argparse.ArgumentParser(add_help=False)
+    root_parser.add_argument(
+        '-b', '--no-browser', dest='no_browser', action='store_true',
+        help='Do not open browser at label studio start')
+    root_parser.add_argument(
+        '-d', '--debug', dest='debug', action='store_true',
+        help='Debug mode for Flask', default=None)
+    root_parser.add_argument(
+        '--force', dest='force', action='store_true',
+        help='Force creation new resources if exist')
+    root_parser.add_argument(
+        '--root-dir', dest='root_dir', default='.',
+        help='Projects root directory')
+    root_parser.add_argument(
+        '-v', '--verbose', dest='verbose', action='store_true',
+        help='Increase output verbosity')
+    root_parser.add_argument(
+        '--template', dest='template', choices=available_templates,
+        help='Choose from predefined project templates')
+    root_parser.add_argument(
+        '-c', '--config', dest='config_path', type=valid_filepath,
+        help='Server config')
+    root_parser.add_argument(
+        '-l', '--label-config', dest='label_config', type=valid_filepath,
+        help='Label config path')
+    root_parser.add_argument(
+        '-i', '--input-path', dest='input_path', type=valid_filepath,
+        help='Input path to task file or directory with tasks')
+    root_parser.add_argument(
+        '--input-format', dest='input_format',
+        choices=('json', 'json-dir', 'text', 'text-dir', 'image-dir', 'audio-dir'), default='json',
+        help='Format of input tasks specified by --input-path')
+    # deprecated option: the path is not validated, so the deprecation error below is always reached
+    root_parser.add_argument(
+        '-o', '--output-dir', dest='output_dir',
+        help='Output directory for completions')
+    root_parser.add_argument(
+        '--ml-backend-url', dest='ml_backend_url',
+        help='Machine learning backend URL')
+    root_parser.add_argument(
+        '--ml-backend-name', dest='ml_backend_name',
+        help='Machine learning backend name')
+    root_parser.add_argument(
+        '-p', '--port', dest='port', default=8200, type=int,
+        help='Server port')
+
+    parser = argparse.ArgumentParser(description='Label studio')
+
+    subparsers = parser.add_subparsers(dest='command', help='Available commands')
+    subparsers.required = True
+
+    # init sub-command parser
+
+    parser_init = subparsers.add_parser('init', help='Initialize Label Studio', parents=[root_parser])
+    parser_init.add_argument(
+        'project_name',
+        help='Path to directory where project state will be initialized')
+
+    # start sub-command parser
+
+    parser_start = subparsers.add_parser('start', help='Start Label Studio server', parents=[root_parser])
+    parser_start.add_argument(
+        'project_name',
+        help='Path to directory where project state has been initialized')
+    parser_start.add_argument(
+        '--init', dest='init', action='store_true',
+        help='Initialize if project is not initialized yet')
+
+    # start-multi-session sub-command parser
+
+    subparsers.add_parser('start-multi-session', help='Start Label Studio server', parents=[root_parser])
+
+    args = parser.parse_args()
+    if args.output_dir is not None:
+        raise RuntimeError('"--output-dir" option is deprecated and has no effect.\n'
+                           'All output results are saved to project_name/completions directory')
+
+    # template's label config is used only when "--label-config" is not given explicitly
+    if args.template and not args.label_config:
+        args.label_config = os.path.join(find_dir('examples'), args.template, 'config.xml')
+    return args
diff --git a/label_studio/utils/io.py b/label_studio/utils/io.py
index 4d3f3ee3d3e..3bd02ef9a02 100644
--- a/label_studio/utils/io.py
+++ b/label_studio/utils/io.py
@@ -2,6 +2,8 @@
 import pkg_resources
 import shutil
 import glob
+import io
+import json
 
 from contextlib import contextmanager
 from tempfile import mkstemp, mkdtemp
@@ -84,3 +86,22 @@ def get_data_dir():
 def delete_dir_content(dirpath):
     for f in glob.glob(dirpath + '/*'):
         os.remove(f)
+
+
+def remove_file_or_dir(path):
+    """ Delete a regular file, or a directory together with its content; other paths are ignored """
+    for matches, remove in ((os.path.isfile, os.remove), (os.path.isdir, shutil.rmtree)):
+        if matches(path):
+            return remove(path)
+
+
+def iter_files(root_dir, ext):
+    """ Walk root_dir recursively and yield paths of files whose lowercased name ends with ext """
+    for dirpath, _, filenames in os.walk(root_dir):
+        for name in filter(lambda n: n.lower().endswith(ext), filenames):
+            yield os.path.join(dirpath, name)
+
+
+def json_load(file):  # `file` is a path, not a file object: it is opened here and closed on return
+    with io.open(file) as f:
+        return json.load(f)
diff --git a/label_studio/utils/misc.py b/label_studio/utils/misc.py
index 7ddb0a43f9b..469461a70c0 100644
--- a/label_studio/utils/misc.py
+++ b/label_studio/utils/misc.py
@@ -138,27 +138,6 @@ def config_comments_free(xml_config):
     return xml_config
 
 
-class LabelConfigParser(object):
-
-    def __init__(self, filepath):
-        with io.open(filepath) as f:
-            self._config = f.read()
-
-    def get_value_for_name(self, name):
-        tag_iter = ElementTree.fromstring(self._config).iter()
-        return next((
-            tag.attrib.get('value') for tag in tag_iter
-            if tag.attrib.get('name') == name), None
-        )
-
-    def get_input_data_tags(self):
-        tag_iter = ElementTree.fromstring(self._config).iter()
-        return [
-            tag for tag in tag_iter
-            if tag.attrib.get('name') and tag.attrib.get('value', '').startswith('$')
-        ]
-
-
 def get_app_version():
     return pkg_resources.get_distribution('label-studio').version
 
diff --git a/label_studio/utils/schema/default_config.json b/label_studio/utils/schema/default_config.json
new file mode 100644
index 00000000000..2a0de94d421
--- /dev/null
+++ b/label_studio/utils/schema/default_config.json
@@ -0,0 +1,23 @@
+{
+    "title": "Label Studio",
+    "port": 8200,
+    "debug": false,
+
+    "label_config": null,
+    "input_path": null,
+    "output_dir": "completions",
+
+    "instruction": "Type some <b>hypertext</b> for label experts!",
+    "allow_delete_completions": true,
+    "templates_dir": "examples",
+
+    "editor": {
+        "debug": false
+    },
+
+    "!ml_backend": {
+        "url": "http://localhost:9090",
+        "model_name": "my_super_model"
+    },
+    "sampling": "uniform"
+}
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 4de6f28e9e6..318746efd28 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 import setuptools
 
 # Package version
-version = '0.4.4.post2'
+version = '0.4.5'
 
 # Readme
 with open('README.md', 'r') as f: