diff --git a/README.md b/README.md index 554e3f02..585005b6 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ [![Waffle.io](https://badge.waffle.io/m-lab/etl-gardener.svg?title=Ready)](http://waffle.io/m-lab/etl-gardener) - ## Gardener provides services for maintaining and reprocessing mlab data. ## Unit Testing diff --git a/cloud/bq/dedup.go b/cloud/bq/dedup.go index 2476c86a..2b5cf350 100644 --- a/cloud/bq/dedup.go +++ b/cloud/bq/dedup.go @@ -201,6 +201,16 @@ var dedupTemplateTCPInfo = ` ) WHERE row_number = 1` +var dedupTemplateNDTLegacy = ` + #standardSQL + SELECT * EXCEPT (row_number) + FROM ( + SELECT *, ROW_NUMBER() OVER (PARTITION BY test_id ORDER BY ParseInfo.ParseTime DESC) AS row_number + FROM ` + "`%s`" + ` + ) + WHERE + row_number = 1` + // Dedup executes a query that dedups and writes to destination partition. // This function is alpha status. The interface may change without notice // or major version number change. @@ -234,6 +244,8 @@ func Dedup(ctx context.Context, dsExt *dataset.Dataset, src string, destTable bq queryString = fmt.Sprintf(dedupTemplateTraceroute, src) case strings.HasPrefix(destTable.TableID(), "tcpinfo"): queryString = fmt.Sprintf(dedupTemplateTCPInfo, src) + case strings.HasPrefix(destTable.TableID(), "legacy"): + queryString = fmt.Sprintf(dedupTemplateNDTLegacy, src) default: log.Println("Only handles sidestream, ndt, switch, traceroute, not " + destTable.TableID()) return nil, errors.New("Unknown table type") diff --git a/cloud/bq/sanity.go b/cloud/bq/sanity.go index c96fba16..d82aefd1 100644 --- a/cloud/bq/sanity.go +++ b/cloud/bq/sanity.go @@ -176,10 +176,20 @@ func GetTableDetail(ctx context.Context, dsExt *dataset.Dataset, table bqiface.T %s -- where clause`, dataset, tableName, where) + legacyNDTQuery := fmt.Sprintf(` + #standardSQL + SELECT COUNT(DISTINCT test_id) AS TestCount, COUNT(DISTINCT ParseInfo.TaskFileName) AS TaskFileCount + FROM `+"`%s.%s`"+` + %s -- where clause`, + dataset, tableName, where) + // TODO - 
find a better way to do this. + // https://github.com/m-lab/etl-gardener/issues/158 query := legacyQuery if parts[0] == "tcpinfo" { query = tcpinfoQuery + } else if parts[0] == "legacy" { + query = legacyNDTQuery } err := dsExt.QueryAndParse(ctx, query, &detail) if err != nil { diff --git a/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml new file mode 100644 index 00000000..2e912127 --- /dev/null +++ b/k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml @@ -0,0 +1,99 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: etl-gardener-legacy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + # Used to match pre-existing pods that may be affected during updates. + run: etl-gardener-legacy + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + # Pod template. + template: + metadata: + labels: + # Note: run=etl-gardener-legacy should match a service config with a + # public IP and port so that it is publicly accessible. + run: etl-gardener-legacy + annotations: + # Tell prometheus service discovery to collect metrics from the containers. + prometheus.io/scrape: 'true' + spec: + # When container receives SIGTERM, it begins a new checkpoint. This can + # take longer than the default grace period of 30s. + terminationGracePeriodSeconds: 300 + + # Place the pod into the Guaranteed QoS by setting equal resource + # requests and limits for *all* containers in the pod. 
+ # For more background, see: + # https://github.com/kubernetes/community/blob/master/contributors/design-proposals/resource-qos.md + containers: + - image: gcr.io/{{GCLOUD_PROJECT}}/github-m-lab-etl-gardener:{{GIT_COMMIT}} + name: etl-gardener + env: + - name: GARDENER_SERVICE + value: "true" + - name: GIT_COMMIT + value: "{{GIT_COMMIT}}" + - name: PROJECT + value: "{{GCLOUD_PROJECT}}" + # NOTE: We read archives from the public archive for all projects. + # TODO: Update when we address https://github.com/m-lab/dev-tracker/issues/369 + - name: TASKFILE_BUCKET + value: "pusher-{{GCLOUD_PROJECT}}" # This will work for sandbox/staging, but prod should use archive-measurement-lab. + - name: START_DATE + value: "20190513" + - name: DATE_SKIP # Should be 0 for normal operation + value: "0" + - name: TASK_FILE_SKIP # Should be 0 for normal operation + value: "0" + - name: EXPERIMENT + value: "ndt/legacy" + - name: DATASET + value: "batch" + - name: FINAL_DATASET + value: "base_tables" + - name: QUEUE_BASE + value: "etl-legacy-batch-" + - name: NUM_QUEUES + value: "2" + + ports: + - name: prometheus-port + containerPort: 9090 + - name: service-port + containerPort: 8080 + + livenessProbe: + httpGet: + path: /alive + port: service-port + initialDelaySeconds: 30 + periodSeconds: 60 + + resources: + requests: + memory: "3Gi" + cpu: "1" + limits: + memory: "3Gi" + cpu: "1" + + volumeMounts: + - mountPath: /volume-claim + name: legacy-storage + + nodeSelector: + gardener-node: "true" + + volumes: + - name: legacy-storage + persistentVolumeClaim: + claimName: gardener-legacy-disk0 + diff --git a/k8s/data-processing-cluster/persistentvolumes/persistent-volumes.yml b/k8s/data-processing-cluster/persistentvolumes/persistent-volumes.yml index b61409b1..36c52c92 100644 --- a/k8s/data-processing-cluster/persistentvolumes/persistent-volumes.yml +++ b/k8s/data-processing-cluster/persistentvolumes/persistent-volumes.yml @@ -49,3 +49,16 @@ spec: resources: requests: storage: 10Gi +--- 
+apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: gardener-legacy-disk0 + annotations: + volume.beta.kubernetes.io/storage-class: "slow" +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi diff --git a/k8s/data-processing-cluster/services/etl-gardener-legacy-service.yml b/k8s/data-processing-cluster/services/etl-gardener-legacy-service.yml new file mode 100644 index 00000000..92b78111 --- /dev/null +++ b/k8s/data-processing-cluster/services/etl-gardener-legacy-service.yml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: etl-gardener-legacy-service + namespace: default +spec: + ports: + - port: 8080 + protocol: TCP + targetPort: 8080 + selector: + run: etl-gardener-legacy + sessionAffinity: None + type: LoadBalancer