
Commit 68549f8

Author: David Cavazos
dataflow/custom-containers: add readmes and support cli options (GoogleCloudPlatform#7601)
* add readmes and support cli options
* run from run.yaml file
* update tests to run from run.yaml
* launch jobs asynchronously, do not wait until finish
* add googleapiclient library
* remove subprocess unused import
* remove unused substitution
* specify image tag with uuid for tests
* clean commands
1 parent: 3f81e36 · commit: 68549f8

27 files changed: +452 −95 lines changed
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# Ignore everything except the source files.
+**/*
+!Dockerfile
+!requirements.txt
+!*.py

dataflow/custom-containers/miniconda/Dockerfile

Lines changed: 4 additions & 2 deletions
@@ -18,7 +18,7 @@ FROM continuumio/miniconda3:4.10.3-alpine AS builder
 
 # Create a virtual environment and make it standalone with conda-pack.
 # https://conda.github.io/conda-pack
-RUN conda create -y -n env python=3.8 \
+RUN conda create -y -n env python=3.9 \
    && conda install -y conda-pack \
    && conda-pack -n env -o /tmp/env.tar \
    && mkdir /opt/python \
@@ -28,8 +28,10 @@ RUN conda create -y -n env python=3.8 \
 # -- Main image
 FROM ubuntu:latest
 
+WORKDIR /pipeline
+
 # Set the entrypoint to Apache Beam SDK worker launcher.
-COPY --from=apache/beam_python3.8_sdk:2.37.0 /opt/apache/beam /opt/apache/beam
+COPY --from=apache/beam_python3.9_sdk:2.37.0 /opt/apache/beam /opt/apache/beam
 ENTRYPOINT [ "/opt/apache/beam/boot" ]
 
 # Copy the python installation from the builder stage.
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+# Miniconda custom container
+
+This sample runs a Dataflow pipeline where the workers use a custom container with a
+[Miniconda](https://docs.conda.io/en/latest/miniconda.html)
+Python environment.
+
+## Before you begin
+
+Make sure you have followed the
+[Dataflow setup instructions](../../README.md).
+
+You will also need:
+* A Cloud Storage bucket; [create one](https://console.cloud.google.com/storage/create-bucket) if needed.
+
+```sh
+export PROJECT=$(gcloud config get-value project)
+export BUCKET="my-cloud-storage-bucket"
+```
+
+> ℹ️ Make sure your `BUCKET` name does _not_ include the `gs://` prefix.
+
+## Building the Docker image
+
+We use
+[Cloud Build](https://cloud.google.com/build)
+to build the container image for the workers and save it in
+[Container Registry](https://cloud.google.com/container-registry/).
+
+```sh
+export IMAGE=gcr.io/$PROJECT/samples/dataflow-miniconda:latest
+
+gcloud builds submit . --tag=$IMAGE
+```
+
+## Running the Dataflow job
+
+We use Cloud Build to run the [Dataflow](https://cloud.google.com/dataflow) job.
+
+The [`run.yaml`](run.yaml) file contains the command we use to launch the Dataflow job.
+
+> ℹ️ We launch the job from the worker image to make sure the job runs
+> with the same Python version as the workers and with all the dependencies installed.
+
+```sh
+# Choose the location where you want to run your Dataflow job.
+# For a list of all supported locations, see:
+# https://cloud.google.com/dataflow/docs/resources/locations
+export REGION="us-central1"
+
+export JOB_NAME="dataflow-miniconda-$(date +"%F-%H%M%S")"
+export TEMP_LOCATION="gs://$BUCKET/samples/dataflow-miniconda"
+
+gcloud builds submit \
+  --config run.yaml \
+  --substitutions "_JOB_NAME=$JOB_NAME,_REGION=$REGION,_TEMP_LOCATION=$TEMP_LOCATION" \
+  --no-source
+```

dataflow/custom-containers/miniconda/e2e_test.py

Lines changed: 20 additions & 19 deletions
@@ -14,8 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import subprocess
-
 try:
     # `conftest` cannot be imported when running in `nox`, but we still
     # try to import it for the autocomplete when writing the tests.
@@ -34,21 +32,24 @@ def bucket_name(utils: Utils) -> str:
 
 @pytest.fixture(scope="session")
 def container_image(utils: Utils) -> str:
-    yield from utils.cloud_build_submit(image_name=NAME)
-
-
-def test_tensorflow_minimal(
-    utils: Utils, bucket_name: str, container_image: str
-) -> None:
-    subprocess.check_call(
-        [
-            "python",
-            "main.py",
-            "--runner=DataflowRunner",
-            f"--project={utils.project}",
-            f"--region={utils.region}",
-            f"--temp_location=gs://{bucket_name}",
-            f"--sdk_container_image={container_image}",
-            "--experiment=use_runner_v2",
-        ]
+    yield from utils.cloud_build_submit(NAME)
+
+
+@pytest.fixture(scope="session")
+def run_dataflow_job(utils: Utils, bucket_name: str, container_image: str) -> str:
+    yield from utils.cloud_build_submit(
+        config="run.yaml",
+        substitutions={
+            "_IMAGE": container_image,
+            "_JOB_NAME": utils.hyphen_name(NAME),
+            "_TEMP_LOCATION": f"gs://{bucket_name}/temp",
+            "_REGION": utils.region,
+        },
+        source="--no-source",
     )
+
+
+def test_custom_container_miniconda(utils: Utils, run_dataflow_job: str) -> None:
+    # Wait until the job finishes.
+    job_id = utils.dataflow_job_id(utils.hyphen_name(NAME))
+    utils.dataflow_jobs_wait(job_id)
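
The `Utils` helpers used here (`cloud_build_submit`, `dataflow_job_id`, `dataflow_jobs_wait`) live in the repo's shared `conftest.py`, which is not part of this diff. Since the commit also adds `google-api-python-client` to the test requirements, a wait helper like the one called above presumably polls the Dataflow REST API; the following is only a minimal sketch of that idea (the function name and polling parameters are assumptions, not the repo's actual implementation):

```python
# Sketch only: the real helper lives in conftest.py and may differ.
import time

from googleapiclient.discovery import build


def wait_for_dataflow_job(project: str, region: str, job_id: str, timeout: int = 1800) -> str:
    """Polls the Dataflow API until the job reaches a terminal state."""
    dataflow = build("dataflow", "v1b3")
    terminal_states = {"JOB_STATE_DONE", "JOB_STATE_FAILED", "JOB_STATE_CANCELLED"}
    deadline = time.time() + timeout
    while time.time() < deadline:
        job = (
            dataflow.projects()
            .locations()
            .jobs()
            .get(projectId=project, location=region, jobId=job_id)
            .execute()
        )
        state = job.get("currentState", "")
        if state in terminal_states:
            return state
        time.sleep(30)
    raise TimeoutError(f"Dataflow job {job_id} did not finish within {timeout} seconds")
```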

dataflow/custom-containers/miniconda/main.py

Lines changed: 11 additions & 7 deletions
@@ -16,17 +16,21 @@
 
 import logging
 import platform
+from typing import List, Optional
 
 import apache_beam as beam
+from apache_beam.options.pipeline_options import PipelineOptions
 
 
-def run() -> None:
-    with beam.Pipeline() as pipeline:
-        (
-            pipeline
-            | "Create data" >> beam.Create(["Hello", "World!", platform.platform()])
-            | "Print" >> beam.Map(logging.info)
-        )
+def run(beam_args: Optional[List[str]] = None) -> None:
+    beam_options = PipelineOptions(beam_args, save_main_session=True)
+    pipeline = beam.Pipeline(options=beam_options)
+    (
+        pipeline
+        | "Create data" >> beam.Create(["Hello", "World!", platform.platform()])
+        | "Print" >> beam.Map(logging.info)
+    )
+    pipeline.run()
 
 
 if __name__ == "__main__":
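
The hunk stops at the `if __name__ == "__main__":` line, so the updated entry-point body is not shown here. The common pattern for these samples is to parse known flags with `argparse` and forward everything else to Beam as pipeline options; a hedged sketch of such an entry point (the flags and wiring are assumptions, not taken from this commit) would be:

```python
# Assumed entry point: forwards unrecognized CLI flags to run() as Beam options.
import argparse
import logging

if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    # Flags like --runner, --project, --region, --temp_location and
    # --sdk_container_image are left unparsed and passed through to Beam.
    _, beam_args = parser.parse_known_args()

    run(beam_args)
```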

dataflow/custom-containers/miniconda/noxfile_config.py

Lines changed: 2 additions & 2 deletions
@@ -22,10 +22,10 @@
 
 TEST_CONFIG_OVERRIDE = {
     # You can opt out from the test for specific Python versions.
-    # > ℹ️ We're opting out of all Python versions except 3.8.
+    # > ℹ️ We're opting out of all Python versions except 3.9.
     # > The Python version used is defined by the Dockerfile, so it's redundant
     # > to run multiple tests since they would all be running the same Dockerfile.
-    "ignored_versions": ["2.7", "3.6", "3.7", "3.9", "3.10"],
+    "ignored_versions": ["2.7", "3.6", "3.7", "3.8", "3.10"],
     # Old samples are opted out of enforcing Python type hints
     # All new samples should feature them
     "enforce_type_hints": True,
Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+google-api-python-client==2.40.0
 google-cloud-storage==1.43.0
 pytest-xdist==2.5.0
 pytest==6.2.4
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-apache-beam[gcp]==2.36.0
+apache-beam[gcp]==2.37.0
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This Cloud Build config runs a Dataflow job.
+# We use the same worker image to launch the job.
+# That way we guarantee the same Python version for the workers.
+# It also already has all the requirements installed.
+
+# -----------------------------------------------------------------------------
+# To learn more about this file:
+#   https://cloud.google.com/build/docs/build-config
+#
+# To learn more about Cloud Build variable substitutions:
+#   https://cloud.google.com/build/docs/configuring-builds/substitute-variable-values#using_user-defined_substitutions
+# -----------------------------------------------------------------------------
+
+substitutions:
+  _JOB_NAME: ''
+  _TEMP_LOCATION: ''
+  _REGION: us-central1
+  _IMAGE: samples/dataflow-miniconda:latest
+
+steps:
+- name: gcr.io/$PROJECT_ID/$_IMAGE
+  entrypoint: python
+  args:
+  - /pipeline/main.py
+  - --runner=DataflowRunner
+  - --project=$PROJECT_ID
+  - --region=$_REGION
+  - --job_name=$_JOB_NAME
+  - --temp_location=$_TEMP_LOCATION
+  - --sdk_container_image=gcr.io/$PROJECT_ID/$_IMAGE
+
+options:
+  logging: CLOUD_LOGGING_ONLY
+
+# Use the Compute Engine default service account to launch the job.
+serviceAccount: projects/$PROJECT_ID/serviceAccounts/[email protected]
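
The README drives this config with `gcloud builds submit --config run.yaml --no-source`, and the test suite submits it through `utils.cloud_build_submit`. For completeness, here is a hedged sketch of submitting the same config programmatically with `google-api-python-client` (the library added to the test requirements); the helper name and the PyYAML dependency are assumptions, and this is not how the repo itself launches the job:

```python
# Sketch: programmatic equivalent of `gcloud builds submit --config run.yaml --no-source`.
import yaml  # PyYAML, assumed available

from googleapiclient.discovery import build


def submit_run_yaml(project: str, job_name: str, region: str, temp_location: str, image: str) -> str:
    """Submits run.yaml to Cloud Build with user-defined substitutions; returns the build ID."""
    with open("run.yaml") as f:
        build_config = yaml.safe_load(f)

    # Equivalent of the --substitutions flag; Cloud Build resolves $PROJECT_ID server-side.
    build_config["substitutions"] = {
        "_JOB_NAME": job_name,
        "_REGION": region,
        "_TEMP_LOCATION": temp_location,
        "_IMAGE": image,
    }

    cloudbuild = build("cloudbuild", "v1")
    operation = (
        cloudbuild.projects().builds().create(projectId=project, body=build_config).execute()
    )
    return operation["metadata"]["build"]["id"]
```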
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# Ignore everything except the source files.
+**/*
+!Dockerfile
+!requirements.txt
+!*.py
