Skip to content

Commit

Permalink
Feat: Extract the tabular metadata for Cloud Datasets program (#452)
Browse files Browse the repository at this point in the history
  • Loading branch information
happyhuman committed Sep 9, 2022
1 parent 2ecd9ea commit 1a3d59e
Show file tree
Hide file tree
Showing 10 changed files with 1,017 additions and 0 deletions.
26 changes: 26 additions & 0 deletions datasets/cloud_datasets/infra/cloud_datasets_dataset.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Private BigQuery dataset that backs the Cloud Datasets program's own
# metadata tables (this is not itself a public dataset).
resource "google_bigquery_dataset" "_cloud_datasets" {
  project     = var.project_id
  dataset_id  = "_cloud_datasets"
  description = "A dataset dedicated to Google Cloud Datasets Program and its metadata (not a public dataset)"
}

# Expose the dataset ID so downstream pipeline configs can reference it.
output "bigquery_dataset-_cloud_datasets-dataset_id" {
  value = google_bigquery_dataset._cloud_datasets.dataset_id
}
232 changes: 232 additions & 0 deletions datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
/**
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Metadata table: one row per tabular dataset tracked by the Cloud Datasets
# program. dataset_id references the dataset resource directly, which gives
# Terraform an implicit dependency — the previous explicit depends_on block
# was redundant and the hard-coded "_cloud_datasets" string was a drift risk.
resource "google_bigquery_table" "_cloud_datasets_tabular_datasets" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset._cloud_datasets.dataset_id
  table_id    = "tabular_datasets"
  description = "This table contains all the metadata for all the tabular datasets in the Cloud Datasets program"
  schema      = <<EOF
[
  {
    "name": "extracted_at",
    "description": "The date and time when this row was extracted from BigQuery",
    "type": "TIMESTAMP"
  },
  {
    "name": "created_at",
    "description": "The date and time when the dataset was created",
    "type": "TIMESTAMP"
  },
  {
    "name": "modified_at",
    "description": "The date and time when the dataset was last modified",
    "type": "TIMESTAMP"
  },
  {
    "name": "project_id",
    "description": "The GCP project where the public dataset is stored",
    "type": "STRING"
  },
  {
    "name": "dataset_id",
    "description": "The BigQuery dataset ID",
    "type": "STRING"
  },
  {
    "name": "description",
    "description": "The dataset description",
    "type": "STRING"
  },
  {
    "name": "num_tables",
    "description": "Number of tables contained in this dataset",
    "type": "INTEGER"
  }
]
EOF
}

# Outputs consumed by the pipeline configuration.
output "bigquery_table-_cloud_datasets_tabular_datasets-table_id" {
  value = google_bigquery_table._cloud_datasets_tabular_datasets.table_id
}

output "bigquery_table-_cloud_datasets_tabular_datasets-id" {
  value = google_bigquery_table._cloud_datasets_tabular_datasets.id
}

# Metadata table: one row per BigQuery table in the Cloud Datasets program.
# Fixes copy-paste errors in the schema column descriptions (created_at,
# modified_at and description described the *dataset*, not the table), and
# references the dataset resource directly so the implicit dependency
# replaces the redundant depends_on block.
resource "google_bigquery_table" "_cloud_datasets_tables" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset._cloud_datasets.dataset_id
  table_id    = "tables"
  description = "This table contains all the metadata for all the tables in the Cloud Datasets program"
  schema      = <<EOF
[
  {
    "name": "extracted_at",
    "description": "The date and time when this row was extracted from BigQuery",
    "type": "TIMESTAMP"
  },
  {
    "name": "created_at",
    "description": "The date and time when the table was created",
    "type": "TIMESTAMP"
  },
  {
    "name": "modified_at",
    "description": "The date and time when the table was last modified",
    "type": "TIMESTAMP"
  },
  {
    "name": "project_id",
    "description": "The GCP project where the public dataset is stored",
    "type": "STRING"
  },
  {
    "name": "dataset_id",
    "description": "The BigQuery dataset ID",
    "type": "STRING"
  },
  {
    "name": "table_id",
    "description": "The BigQuery table ID",
    "type": "STRING"
  },
  {
    "name": "description",
    "description": "The table description",
    "type": "STRING"
  },
  {
    "name": "type",
    "description": "The type of the table",
    "type": "STRING"
  },
  {
    "name": "num_bytes",
    "description": "The number of bytes the table allocated on disk",
    "type": "INTEGER"
  },
  {
    "name": "num_rows",
    "description": "The number of rows in the table",
    "type": "INTEGER"
  },
  {
    "name": "num_columns",
    "description": "The number of columns in the table",
    "type": "INTEGER"
  },
  {
    "name": "described_columns",
    "description": "The number of columns in the table with a description",
    "type": "INTEGER"
  }
]
EOF
}

# Outputs consumed by the pipeline configuration.
output "bigquery_table-_cloud_datasets_tables-table_id" {
  value = google_bigquery_table._cloud_datasets_tables.table_id
}

output "bigquery_table-_cloud_datasets_tables-id" {
  value = google_bigquery_table._cloud_datasets_tables.id
}

# Metadata table: one row per column (field) of every table tracked by the
# Cloud Datasets program. Fixes the grammar in the resource description
# ("all the field" -> "all the fields") and references the dataset resource
# directly so the implicit dependency replaces the redundant depends_on.
resource "google_bigquery_table" "_cloud_datasets_tables_fields" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset._cloud_datasets.dataset_id
  table_id    = "tables_fields"
  description = "This table contains all the metadata for all the fields in all the tables in the Cloud Datasets program"
  schema      = <<EOF
[
  {
    "name": "extracted_at",
    "description": "The date and time when this row was extracted from BigQuery",
    "type": "TIMESTAMP"
  },
  {
    "name": "project_id",
    "description": "The GCP project where the public dataset is stored",
    "type": "STRING"
  },
  {
    "name": "dataset_id",
    "description": "The BigQuery dataset ID",
    "type": "STRING"
  },
  {
    "name": "table_id",
    "description": "The BigQuery table ID",
    "type": "STRING"
  },
  {
    "name": "name",
    "description": "The name of the field",
    "type": "STRING"
  },
  {
    "name": "description",
    "description": "The description for the field",
    "type": "STRING"
  },
  {
    "name": "field_type",
    "description": "The type of the field",
    "type": "STRING"
  },
  {
    "name": "mode",
    "description": "The mode of the field",
    "type": "STRING"
  },
  {
    "name": "precision",
    "description": "Precision for the NUMERIC field",
    "type": "INTEGER"
  },
  {
    "name": "scale",
    "description": "Scale for the NUMERIC field",
    "type": "INTEGER"
  },
  {
    "name": "max_length",
    "description": "Maximum length for the STRING or BYTES field",
    "type": "INTEGER"
  }
]
EOF
}

# Outputs consumed by the pipeline configuration.
output "bigquery_table-_cloud_datasets_tables_fields-table_id" {
  value = google_bigquery_table._cloud_datasets_tables_fields.table_id
}

output "bigquery_table-_cloud_datasets_tables_fields-id" {
  value = google_bigquery_table._cloud_datasets_tables_fields.id
}
27 changes: 27 additions & 0 deletions datasets/cloud_datasets/infra/provider.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/**
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Google provider configured from module inputs; all resources in this
# module are created in var.project_id / var.region.
provider "google" {
  project = var.project_id
  region  = var.region
}

# Identity of the credentials Terraform is currently running as
# (useful for verifying service-account impersonation).
data "google_client_openid_userinfo" "me" {}

output "impersonating-account" {
  value = data.google_client_openid_userinfo.me.email
}
26 changes: 26 additions & 0 deletions datasets/cloud_datasets/infra/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Input variables for the cloud_datasets infra module. All are required
# except iam_policies, which defaults to an empty map.
# NOTE(review): presumably these are supplied by the shared datasets
# pipeline tooling — confirm against the generator that renders this module.
variable "project_id" {}
variable "bucket_name_prefix" {}
variable "impersonating_acct" {}
variable "region" {}
variable "env" {}
variable "iam_policies" {
default = {}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The base image for this build
# FROM gcr.io/google.com/cloudsdktool/cloud-sdk:slim
FROM python:3.8

# Allow statements and log messages to appear in Cloud logs.
# Use the ENV key=value form; the space-separated form is legacy.
ENV PYTHONUNBUFFERED=True

# Copy the requirements file into the image
COPY requirements.txt ./

# Install the packages specified in the requirements file
RUN python3 -m pip install --no-cache-dir -r requirements.txt

# The WORKDIR instruction sets the working directory for any RUN, CMD,
# ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile.
# If the WORKDIR doesn't exist, it will be created even if it's not used in
# any subsequent Dockerfile instruction
WORKDIR /custom

# Copy the specific data processing script/s in the image under /custom/*
COPY ./script.py .

# Command to run the data processing script when the container is run
CMD ["python3", "script.py"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pandas
google-cloud-storage
google-cloud-bigquery

0 comments on commit 1a3d59e

Please sign in to comment.