Skip to content

Commit

Permalink
Merge pull request #160 from m-lab/sandbox-soltesz-add-ndt-legacy
Browse files Browse the repository at this point in the history
Add ndt legacy configuration
  • Loading branch information
stephen-soltesz committed Jun 19, 2019
2 parents b7b655c + 4296759 commit 4306d2d
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 1 deletion.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
[![Waffle.io](https://badge.waffle.io/m-lab/etl-gardener.svg?title=Ready)](http://waffle.io/m-lab/etl-gardener)



## Gardener provides services for maintaining and reprocessing mlab data.

## Unit Testing
Expand Down
12 changes: 12 additions & 0 deletions cloud/bq/dedup.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,16 @@ var dedupTemplateTCPInfo = `
)
WHERE row_number = 1`

// dedupTemplateNDTLegacy is the standard SQL template used to dedup NDT
// legacy tables. Rows are partitioned by test_id and ranked by
// ParseInfo.ParseTime in descending order; only the first-ranked row of each
// partition is selected, and the helper row_number column is dropped from the
// output. The single %s verb is the source table, which the template wraps in
// backticks.
var dedupTemplateNDTLegacy = "\n#standardSQL\n" +
	"SELECT * EXCEPT (row_number)\n" +
	"FROM (\n" +
	"SELECT *, ROW_NUMBER() OVER (PARTITION BY test_id ORDER BY ParseInfo.ParseTime DESC) AS row_number\n" +
	"FROM `%s`\n" +
	")\n" +
	"WHERE\n" +
	"row_number = 1"

// Dedup executes a query that dedups and writes to destination partition.
// This function is alpha status. The interface may change without notice
// or major version number change.
Expand Down Expand Up @@ -234,6 +244,8 @@ func Dedup(ctx context.Context, dsExt *dataset.Dataset, src string, destTable bq
queryString = fmt.Sprintf(dedupTemplateTraceroute, src)
case strings.HasPrefix(destTable.TableID(), "tcpinfo"):
queryString = fmt.Sprintf(dedupTemplateTCPInfo, src)
case strings.HasPrefix(destTable.TableID(), "legacy"):
queryString = fmt.Sprintf(dedupTemplateNDTLegacy, src)
default:
log.Println("Only handles sidestream, ndt, switch, traceroute, not " + destTable.TableID())
return nil, errors.New("Unknown table type")
Expand Down
10 changes: 10 additions & 0 deletions cloud/bq/sanity.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,20 @@ func GetTableDetail(ctx context.Context, dsExt *dataset.Dataset, table bqiface.T
%s -- where clause`,
dataset, tableName, where)

legacyNDTQuery := fmt.Sprintf(`
#standardSQL
SELECT COUNT(DISTINCT test_id) AS TestCount, COUNT(DISTINCT ParseInfo.TaskFileName) AS TaskFileCount
FROM `+"`%s.%s`"+`
%s -- where clause`,
dataset, tableName, where)

// TODO - find a better way to do this.
// https://github.com/m-lab/etl-gardener/issues/158
query := legacyQuery
if parts[0] == "tcpinfo" {
query = tcpinfoQuery
} else if parts[0] == "legacy" {
query = legacyNDTQuery
}
err := dsExt.QueryAndParse(ctx, query, &detail)
if err != nil {
Expand Down
99 changes: 99 additions & 0 deletions k8s/data-processing-cluster/deployments/etl-gardener-legacy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Deployment for the gardener instance that reprocesses the "ndt/legacy"
# experiment (see the EXPERIMENT environment variable below).
#
# apps/v1 is the stable API group for Deployments; extensions/v1beta1 is
# deprecated and is no longer served by Kubernetes 1.16 and later.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: etl-gardener-legacy
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      # Used to match pre-existing pods that may be affected during updates.
      run: etl-gardener-legacy
  strategy:
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 1
    type: RollingUpdate
  # Pod template.
  template:
    metadata:
      labels:
        # Note: run=etl-gardener-legacy must match both the selector above and
        # the selector of the service config that exposes this pod.
        run: etl-gardener-legacy
      annotations:
        # Tell prometheus service discovery to collect metrics from the containers.
        prometheus.io/scrape: 'true'
    spec:
      # When container receives SIGTERM, it begins a new checkpoint. This can
      # take longer than the default grace period of 30s.
      terminationGracePeriodSeconds: 300

      # Place the pod into the Guaranteed QoS by setting equal resource
      # requests and limits for *all* containers in the pod.
      # For more background, see:
      # https://github.com/kubernetes/community/blob/master/contributors/design-proposals/resource-qos.md
      containers:
      - image: gcr.io/{{GCLOUD_PROJECT}}/github-m-lab-etl-gardener:{{GIT_COMMIT}}
        name: etl-gardener
        env:
        - name: GARDENER_SERVICE
          value: "true"
        - name: GIT_COMMIT
          value: "{{GIT_COMMIT}}"
        - name: PROJECT
          value: "{{GCLOUD_PROJECT}}"
        # NOTE: We read archives from the public archive for all projects.
        # TODO: Update when we address https://github.com/m-lab/dev-tracker/issues/369
        - name: TASKFILE_BUCKET
          value: "pusher-{{GCLOUD_PROJECT}}" # This will work for sandbox/staging, but prod should use archive-measurement-lab.
        - name: START_DATE
          value: "20190513"
        - name: DATE_SKIP # Should be 0 for normal operation
          value: "0"
        - name: TASK_FILE_SKIP # Should be 0 for normal operation
          value: "0"
        - name: EXPERIMENT
          value: "ndt/legacy"
        - name: DATASET
          value: "batch"
        - name: FINAL_DATASET
          value: "base_tables"
        - name: QUEUE_BASE
          value: "etl-legacy-batch-"
        - name: NUM_QUEUES
          value: "2"

        ports:
        - name: prometheus-port
          containerPort: 9090
        - name: service-port
          containerPort: 8080

        # The liveness check is an HTTP GET of /alive on the service port.
        livenessProbe:
          httpGet:
            path: /alive
            port: service-port
          initialDelaySeconds: 30
          periodSeconds: 60

        # Requests equal limits, as required for the Guaranteed QoS class.
        resources:
          requests:
            memory: "3Gi"
            cpu: "1"
          limits:
            memory: "3Gi"
            cpu: "1"

        volumeMounts:
        - mountPath: /volume-claim
          name: legacy-storage

      nodeSelector:
        gardener-node: "true"

      # Backed by the gardener-legacy-disk0 persistent volume claim.
      volumes:
      - name: legacy-storage
        persistentVolumeClaim:
          claimName: gardener-legacy-disk0

Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,16 @@ spec:
resources:
requests:
storage: 10Gi
---
# Persistent volume claim mounted by the etl-gardener-legacy deployment
# (referenced there by claimName: gardener-legacy-disk0).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: gardener-legacy-disk0
  annotations:
    # Selects the "slow" storage class.
    # NOTE(review): this beta annotation is the legacy way to pick a storage
    # class; spec.storageClassName is the current field. Confirm before
    # changing, since the other claims in this file are not visible here.
    volume.beta.kubernetes.io/storage-class: "slow"
spec:
  # Request 10Gi, mountable read-write by a single node.
  resources:
    requests:
      storage: 10Gi
  accessModes:
    - ReadWriteOnce
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Service for the legacy gardener: a LoadBalancer forwarding TCP port 8080 to
# port 8080 of the pods labeled run=etl-gardener-legacy.
apiVersion: v1
kind: Service
metadata:
  name: etl-gardener-legacy-service
  namespace: default
spec:
  type: LoadBalancer
  sessionAffinity: None
  # Must match the pod labels of the deployment this service fronts.
  selector:
    run: etl-gardener-legacy
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080

0 comments on commit 4306d2d

Please sign in to comment.