# docker-compose.yml — open-dataplatform stack (Hadoop/HDFS, Hive, Spark, Airflow, Trino, SQLPad, ClickHouse)
version: "3.8"

# Shared settings for the Airflow webserver and scheduler services,
# merged into each via the YAML anchor/merge-key below.
x-airflow-common: &airflow-common
  image: open-dataplatform-airflow
  volumes:
    - ./airflow/config/airflow.cfg:/opt/airflow/airflow.cfg
    - ./airflow/dags:/opt/airflow/dags
    - ./airflow/plugins:/opt/airflow/plugins
    - ./airflow/spark:/scripts/spark
    - ./hadoop/conf:/opt/hadoop/conf
    - ./spark/config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
  environment:
    POSTGRES_HOST: airflow-db
    # Quoted so the value stays a string, consistent with HDFS_PORT below.
    POSTGRES_PORT: "5432"
    HDFS_HOST: namenode
    HDFS_PORT: "8020"
    INGESTION_API_URL: "http://api:3000"
    SCRIPTS_PATH_PREFIX: /scripts/spark/
    # Quoted: plain scalars containing ":" are fragile across YAML parsers.
    SERVICE_PRECONDITION: "airflow-db:5432"
  depends_on:
    - airflow-db
services:
  # Hadoop
  namenode:
    container_name: namenode
    hostname: namenode
    image: open-dataplatform-hadoop-hdfs
    environment:
      NODE_TYPE: namenode
      HDFS_NAMENODE_USER: root
      CLUSTER_NAME: hadoop-cluster
    volumes:
      - namenode:/hadoop/dfs/namenode
      - ./hadoop/conf:/etc/hadoop
    ports:
      # Quoted host:container mapping — Compose docs recommend quoting
      # port strings to avoid YAML scalar-typing surprises.
      - "50070:50070"
datanode:
container_name: datanode
hostname: datanode
image: open-dataplatform-hadoop-hdfs
environment:
NODE_TYPE: datanode
HDFS_DATANODE_USER: root
volumes:
- datanode:/hadoop/dfs/datanode
- ./hadoop/conf:/etc/hadoop
ports:
- 50075:50075
depends_on:
- namenode
# hive
hive-postgres:
image: postgres:9.4
container_name: hive-postgres
hostname: hive-postgres
ports:
- 5433:5432
volumes:
- ./tmp/postgres-hive:/var/lib/postgresql/data
environment:
POSTGRES_PASSWORD: hive
POSTGRES_USER: hive
POSTGRES_DB: metastore
hive-metastore:
hostname: hive-metastore
image: open-dataplatform-hive
container_name: hive-metastore
depends_on:
- hive-postgres
- datanode
volumes:
- ./hadoop/conf:/etc/hadoop
- ./hadoop/conf/hive-site.xml:/opt/hive/conf/hive-site.xml
command: "hive --service metastore"
environment:
NODE_TYPE: metastore
SERVICE_PRECONDITION: hive-postgres:5432 namenode:50070
ports:
- 9083:9083
hive-server:
hostname: hive-server
image: open-dataplatform-hive
container_name: hive-server
depends_on:
- hive-metastore
volumes:
- ./hadoop/conf:/etc/hadoop
- ./hadoop/conf/hive-site.xml:/opt/hive/conf/hive-site.xml
command: "hiveserver2"
ports:
- 10002:10002
# Spark
spark-master:
container_name: spark-master
image: open-dataplatform-spark
ports:
- 4040:4040
- 4041:4041
command:
- master
environment:
SPARK_MASTER_HOST: spark-master
SPARK_MASTER_PORT: 7077
SPARK_MASTER_WEBUI_PORT: 4041
volumes:
- ./hadoop/conf:/opt/hadoop/conf
- ./spark/config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
- ./spark/applications:/scripts
spark-worker:
hostname: spark-worker
container_name: spark-worker
image: open-dataplatform-spark
ports:
- 4042:4040
command:
- worker
depends_on:
- spark-master
environment:
SPARK_WORKER_WEBUI_PORT: 4040
SPARK_MASTER_URL: spark-master:7077
volumes:
- ./hadoop/conf:/opt/hadoop/conf
- ./spark/config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
# Airflow
airflow-db:
hostname: airflow-db
container_name: airflow_db
image: postgres
restart: always
environment:
POSTGRES_PASSWORD: airflow
POSTGRES_USER: airflow
POSTGRES_DB: airflow
volumes:
- ./tmp/postgres-airflow:/var/lib/postgresql/data
ports:
- 5432:5432
airflow-webserver:
hostname: airflow-webserver
container_name: airflow_webserver
<<: *airflow-common
command: "webserver -p 9090"
ports:
- "8793:8793"
- "9090:9090"
airflow-scheduler:
hostname: airflow-scheduler
container_name: airflow_scheduler
<<: *airflow-common
command: "scheduler"
# trino
trino:
image: trinodb/trino:353
container_name: trino
ports:
- 8089:8080
volumes:
- ./trino/config/catalog:/etc/trino/catalog
- ./trino/config/jvm.config:/etc/trino/jvm.config
- ./trino/config/config.properties:/etc/trino/config.properties
sqlpad:
image: sqlpad/sqlpad:6.7.1
container_name: sqlpad
ports:
- "3001:3000"
environment:
SQLPAD_CONNECTIONS__datalake__name: DataLake
SQLPAD_CONNECTIONS__datalake__driver: trino
SQLPAD_CONNECTIONS__datalake__host: trino
SQLPAD_CONNECTIONS__datalake__username: sqlpad
SQLPAD_CONNECTIONS__datalake__catalog: hive
SQLPAD_CONNECTIONS__clickhouse__name: Clickhouse
SQLPAD_CONNECTIONS__clickhouse__driver: clickhouse
SQLPAD_CONNECTIONS__clickhouse__host: clickhouse
SQLPAD_CONNECTIONS__clickhouse__username: sqlpad
SQLPAD_CONNECTIONS__clickhouse__password: sqlpad
SQLPAD_AUTH_DISABLED: "true"
SQLPAD_AUTH_DISABLED_DEFAULT_ROLE: "admin"
volumes:
- sqlpad:/etc/sqlpad/seed-data
clickhouse:
container_name: clickhouse
image: yandex/clickhouse-server:21.6.4.26
volumes:
- $PWD/clickhouse/config/users.xml:/etc/clickhouse-server/users.xml
- clickhouse:/var/lib/clickhouse
# Named volumes for data that must survive container recreation.
volumes:
  namenode:
  datanode:
  sqlpad:
  clickhouse:
networks:
  # Give the default network a fixed name so other stacks can attach to it.
  default:
    name: open-dataplatform-network