Skip to content

Commit

Permalink
feat: Onboard GBIF dataset (#355)
Browse files Browse the repository at this point in the history
* yaml config files

* generate terraform

* generate DAG
  • Loading branch information
adlersantos committed May 12, 2022
1 parent 45dd0b2 commit ab4e208
Show file tree
Hide file tree
Showing 7 changed files with 248 additions and 0 deletions.
40 changes: 40 additions & 0 deletions datasets/gbif/infra/gbif_dataset.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# BigQuery dataset that the GBIF tables of this module are created in.
# It is created in the project given by var.project_id.
resource "google_bigquery_dataset" "gbif" {
  project     = var.project_id
  dataset_id  = "gbif"
  description = "The Global Biodiversity Information Facility is an international network and data infrastructure funded by the world\u0027s governments and aimed at providing anyone, anywhere, open access to data about all types of life on Earth."
}

# IAM policy document for the gbif dataset. One binding is generated for each
# entry of var.iam_policies["bigquery_datasets"]["gbif"]; every entry supplies
# a "role" and the "members" that receive it.
# NOTE(review): var.iam_policies is declared with a default of {}, which has no
# "bigquery_datasets" key to index - confirm callers always provide this entry.
data "google_iam_policy" "bq_ds__gbif" {
  dynamic "binding" {
    for_each = var.iam_policies["bigquery_datasets"]["gbif"]

    content {
      members = binding.value["members"]
      role    = binding.value["role"]
    }
  }
}

# Attaches the policy document built by data.google_iam_policy.bq_ds__gbif to
# the gbif dataset.
# NOTE(review): per the Google provider docs the *_iam_policy resource form is
# authoritative (it replaces the dataset's existing policy) - confirm intended.
resource "google_bigquery_dataset_iam_policy" "gbif" {
  policy_data = data.google_iam_policy.bq_ds__gbif.policy_data
  dataset_id  = google_bigquery_dataset.gbif.dataset_id
}

# Exposes the ID of the gbif BigQuery dataset.
output "bigquery_dataset-gbif-dataset_id" {
  value = google_bigquery_dataset.gbif.dataset_id
}
34 changes: 34 additions & 0 deletions datasets/gbif/infra/gcs_to_bq_pipeline.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# BigQuery table "occurrences" inside the gbif dataset.
#
# dataset_id is taken from the dataset resource instead of repeating the
# literal "gbif": the value is the same, and the expression reference lets
# Terraform order the table after the dataset without an explicit depends_on.
resource "google_bigquery_table" "gbif_occurrences" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset.gbif.dataset_id
  table_id    = "occurrences"
  description = "Evidence of the occurrence of a species (or other taxon) at a particular place on a specified date."
}

# Exposes the table_id attribute of the occurrences table.
output "bigquery_table-gbif_occurrences-table_id" {
  value = google_bigquery_table.gbif_occurrences.table_id
}

# Exposes the id attribute of the occurrences table resource.
output "bigquery_table-gbif_occurrences-id" {
  value = google_bigquery_table.gbif_occurrences.id
}
28 changes: 28 additions & 0 deletions datasets/gbif/infra/provider.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Google provider configuration for this module. Project, region and the
# service account to impersonate are all supplied through input variables.
provider "google" {
  project                     = var.project_id
  region                      = var.region
  impersonate_service_account = var.impersonating_acct
}

# OpenID user info for the credentials the google provider is running with.
data "google_client_openid_userinfo" "me" {}

# Exposes the email reported by that user info, so the account actually in
# use can be checked from the Terraform outputs.
output "impersonating-account" {
  value = data.google_client_openid_userinfo.me.email
}
26 changes: 26 additions & 0 deletions datasets/gbif/infra/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Inputs read by the google provider block (project, region, impersonation).
variable "project_id" {}
variable "region" {}
variable "impersonating_acct" {}

# NOTE(review): no reference to these two variables is visible in this
# module's other files - presumably shared template inputs; confirm.
variable "bucket_name_prefix" {}
variable "env" {}

# IAM bindings, indexed as iam_policies["bigquery_datasets"]["gbif"] by the
# dataset IAM policy data source. Defaults to an empty object.
variable "iam_policies" {
  default = {}
}

26 changes: 26 additions & 0 deletions datasets/gbif/pipelines/dataset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Dataset-level metadata. The description, sources and terms of use are
# deliberately left unset (null).
dataset:
  name: gbif
  friendly_name: gbif
  description: null
  dataset_sources: null
  terms_of_use: null

# Infrastructure resources declared for this dataset.
resources:
  - type: bigquery_dataset
    dataset_id: gbif
    # Folded scalar: the lines below are joined with single spaces into one
    # line, with no trailing newline.
    description: >-
      The Global Biodiversity Information Facility is an international
      network and data infrastructure funded by the world's governments and
      aimed at providing anyone, anywhere, open access to data about all
      types of life on Earth.
47 changes: 47 additions & 0 deletions datasets/gbif/pipelines/gcs_to_bq/gcs_to_bq_dag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from airflow import DAG
from airflow.providers.google.cloud.transfers import gcs_to_bigquery

# Arguments handed to the DAG as its default_args.
default_args = dict(
    owner="Google",
    depends_on_past=False,
    start_date="2022-03-31",
)


# DAG "gbif.gcs_to_bq": a single task that loads the GBIF occurrence Parquet
# files from Cloud Storage into BigQuery.
with DAG(
    dag_id="gbif.gcs_to_bq",
    # Cron expression: 00:00 on the 2nd day of every month.
    schedule_interval="0 0 2 * *",
    catchup=False,
    max_active_runs=1,
    default_view="graph",
    default_args=default_args,
) as dag:

    # Load Parquet files to BQ
    load_parquet_files_to_bq = gcs_to_bigquery.GCSToBigQueryOperator(
        task_id="load_parquet_files_to_bq",
        # Templated: resolved from the "source_bucket" entry of the "gbif"
        # JSON variable.
        bucket="{{ var.json.gbif.source_bucket }}",
        # Templated: the folder name is the run's execution_date with the day
        # fixed to 01; every object under occurrence.parquet/ is matched.
        source_objects=[
            "occurrence/{{ execution_date.strftime('%Y-%m-01') }}/occurrence.parquet/*"
        ],
        source_format="PARQUET",
        destination_project_dataset_table="gbif.occurrences",
        # NOTE(review): WRITE_TRUNCATE should overwrite the table on each
        # load rather than append - confirm this is the intended behaviour.
        write_disposition="WRITE_TRUNCATE",
    )

    # Task graph: a single node with no upstream or downstream dependencies.
    load_parquet_files_to_bq
47 changes: 47 additions & 0 deletions datasets/gbif/pipelines/gcs_to_bq/pipeline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# BigQuery resources declared by this pipeline.
resources:
  - type: bigquery_table
    table_id: occurrences
    # Folded scalar: joined into a single line, no trailing newline.
    description: >-
      Evidence of the occurrence of a species (or other taxon) at a
      particular place on a specified date.

# Definition of the Airflow DAG for this pipeline.
dag:
  airflow_version: 2
  initialize:
    dag_id: gcs_to_bq
    default_args:
      owner: "Google"
      depends_on_past: false
      start_date: "2022-03-31"
    max_active_runs: 1
    # Cron expression: 00:00 on the 2nd day of every month.
    schedule_interval: "0 0 2 * *"
    catchup: false
    default_view: graph

  tasks:
    - operator: "GoogleCloudStorageToBigQueryOperator"
      description: "Load Parquet files to BQ"
      args:
        task_id: "load_parquet_files_to_bq"
        bucket: "{{ var.json.gbif.source_bucket }}"
        # The folder name is the run's execution_date with the day fixed to
        # 01; every object under occurrence.parquet/ is matched.
        source_objects:
          - "occurrence/{{ execution_date.strftime('%Y-%m-01') }}/occurrence.parquet/*"
        source_format: "PARQUET"
        destination_project_dataset_table: "gbif.occurrences"
        write_disposition: "WRITE_TRUNCATE"

  # Task ordering: a single task with no dependencies.
  graph_paths:
    - "load_parquet_files_to_bq"

0 comments on commit ab4e208

Please sign in to comment.