Commit ec4dcce

Update sample dag and doc for Datasync (#23511)

1 parent 5d1e6ff

4 files changed: +82 additions, -271 deletions

airflow/providers/amazon/aws/example_dags/example_datasync_2.py renamed to airflow/providers/amazon/aws/example_dags/example_datasync.py

Lines changed: 30 additions & 40 deletions
@@ -14,47 +14,21 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""
-This is an example dag for using `AWSDataSyncOperator` in a more complex manner.
-
-- Try to get a TaskArn. If one exists, update it.
-- If no tasks exist, try to create a new DataSync Task.
-- If source and destination locations don't exist for the new task, create them first
-- If many tasks exist, raise an Exception
-- After getting or creating a DataSync Task, run it
-
-This DAG relies on the following environment variables:
-
-* SOURCE_LOCATION_URI - Source location URI, usually on premises SMB or NFS
-* DESTINATION_LOCATION_URI - Destination location URI, usually S3
-* CREATE_TASK_KWARGS - Passed to boto3.create_task(**kwargs)
-* CREATE_SOURCE_LOCATION_KWARGS - Passed to boto3.create_location(**kwargs)
-* CREATE_DESTINATION_LOCATION_KWARGS - Passed to boto3.create_location(**kwargs)
-* UPDATE_TASK_KWARGS - Passed to boto3.update_task(**kwargs)
-"""
 
 import json
 import re
 from datetime import datetime
 from os import getenv
 
 from airflow import models
+from airflow.models.baseoperator import chain
 from airflow.providers.amazon.aws.operators.datasync import DataSyncOperator
 
-# [START howto_operator_datasync_2_args]
+TASK_ARN = getenv("TASK_ARN", "my_aws_datasync_task_arn")
 SOURCE_LOCATION_URI = getenv("SOURCE_LOCATION_URI", "smb://hostname/directory/")
-
 DESTINATION_LOCATION_URI = getenv("DESTINATION_LOCATION_URI", "s3://mybucket/prefix")
-
-default_create_task_kwargs = '{"Name": "Created by Airflow"}'
-CREATE_TASK_KWARGS = json.loads(getenv("CREATE_TASK_KWARGS", default_create_task_kwargs))
-
-default_create_source_location_kwargs = "{}"
-CREATE_SOURCE_LOCATION_KWARGS = json.loads(
-    getenv("CREATE_SOURCE_LOCATION_KWARGS", default_create_source_location_kwargs)
-)
-
-bucket_access_role_arn = "arn:aws:iam::11112223344:role/r-11112223344-my-bucket-access-role"
+CREATE_TASK_KWARGS = json.loads(getenv("CREATE_TASK_KWARGS", '{"Name": "Created by Airflow"}'))
+CREATE_SOURCE_LOCATION_KWARGS = json.loads(getenv("CREATE_SOURCE_LOCATION_KWARGS", '{}'))
 default_destination_location_kwargs = """\
 {"S3BucketArn": "arn:aws:s3:::mybucket",
     "S3Config": {"BucketAccessRoleArn":
@@ -63,23 +37,33 @@
 CREATE_DESTINATION_LOCATION_KWARGS = json.loads(
     getenv("CREATE_DESTINATION_LOCATION_KWARGS", re.sub(r"[\s+]", '', default_destination_location_kwargs))
 )
-
-default_update_task_kwargs = '{"Name": "Updated by Airflow"}'
-UPDATE_TASK_KWARGS = json.loads(getenv("UPDATE_TASK_KWARGS", default_update_task_kwargs))
-
-# [END howto_operator_datasync_2_args]
+UPDATE_TASK_KWARGS = json.loads(getenv("UPDATE_TASK_KWARGS", '{"Name": "Updated by Airflow"}'))
 
 with models.DAG(
-    "example_datasync_2",
+    "example_datasync",
     schedule_interval=None,  # Override to match your needs
     start_date=datetime(2021, 1, 1),
     catchup=False,
     tags=['example'],
 ) as dag:
+    # [START howto_operator_datasync_specific_task]
+    # Execute a specific task
+    datasync_specific_task = DataSyncOperator(task_id="datasync_specific_task", task_arn=TASK_ARN)
+    # [END howto_operator_datasync_specific_task]
+
+    # [START howto_operator_datasync_search_task]
+    # Search and execute a task
+    datasync_search_task = DataSyncOperator(
+        task_id="datasync_search_task",
+        source_location_uri=SOURCE_LOCATION_URI,
+        destination_location_uri=DESTINATION_LOCATION_URI,
+    )
+    # [END howto_operator_datasync_search_task]
 
-    # [START howto_operator_datasync_2]
-    datasync_task = DataSyncOperator(
-        task_id="datasync_task",
+    # [START howto_operator_datasync_create_task]
+    # Create a task (the task does not exist)
+    datasync_create_task = DataSyncOperator(
+        task_id="datasync_create_task",
         source_location_uri=SOURCE_LOCATION_URI,
         destination_location_uri=DESTINATION_LOCATION_URI,
         create_task_kwargs=CREATE_TASK_KWARGS,
@@ -88,4 +72,10 @@
         update_task_kwargs=UPDATE_TASK_KWARGS,
         delete_task_after_execution=True,
     )
-    # [END howto_operator_datasync_2]
+    # [END howto_operator_datasync_create_task]
+
+    chain(
+        datasync_specific_task,
+        datasync_search_task,
+        datasync_create_task,
+    )
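
Note that the updated example DAG reads all of its operator kwargs from JSON-encoded environment variables, so a malformed value makes json.loads() raise at DAG-parse time. Below is a minimal sketch (not part of this commit) of setting and sanity-checking those variables before Airflow imports the file; the ARN and role values are hypothetical placeholders.

import json
import os

# Hypothetical values for illustration only.
os.environ["TASK_ARN"] = "arn:aws:datasync:us-east-1:111222333444:task/task-0123456789abcdef0"
os.environ["CREATE_TASK_KWARGS"] = json.dumps({"Name": "Created by Airflow"})
os.environ["CREATE_DESTINATION_LOCATION_KWARGS"] = json.dumps(
    {
        "S3BucketArn": "arn:aws:s3:::mybucket",
        "S3Config": {"BucketAccessRoleArn": "arn:aws:iam::111222333444:role/my-bucket-access-role"},
    }
)

# Mirror the DAG's parsing: every *_KWARGS variable must round-trip through
# json.loads(), otherwise the DAG file fails to import.
for name in ("CREATE_TASK_KWARGS", "CREATE_DESTINATION_LOCATION_KWARGS"):
    print(name, "->", json.loads(os.environ[name]))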

airflow/providers/amazon/aws/example_dags/example_datasync_1.py

Lines changed: 0 additions & 69 deletions
This file was deleted.

airflow/providers/amazon/aws/operators/datasync.py

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@
 
 
 class DataSyncOperator(BaseOperator):
-    r"""Find, Create, Update, Execute and Delete AWS DataSync Tasks.
+    """Find, Create, Update, Execute and Delete AWS DataSync Tasks.
 
     If ``do_xcom_push`` is True, then the DataSync TaskArn and TaskExecutionArn
     which were executed will be pushed to an XCom.
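
Since the docstring touched here advertises the XCom behaviour, here is a hedged sketch of consuming it downstream. It assumes the executed TaskArn/TaskExecutionArn payload is available under the default return_value key, as for any operator run with do_xcom_push=True; report_datasync_result and the "report" task id are illustrative names, and the snippet would sit inside the same DAG context as the example above.

from airflow.operators.python import PythonOperator


def report_datasync_result(ti):
    # With no explicit key, xcom_pull() fetches the upstream task's return value.
    result = ti.xcom_pull(task_ids="datasync_specific_task")
    print("DataSync pushed:", result)


report = PythonOperator(task_id="report", python_callable=report_datasync_result)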
