Use python client in BQ hook create_empty_table/dataset and table_exists (#8377)

* Use python client in BQ hook create_empty_table method

* Refactor table_exists and create_empty_dataset

* Add note in UPDATING
turbaszek committed Apr 22, 2020
1 parent 93ea058 commit 57c8c05
Showing 4 changed files with 202 additions and 258 deletions.
8 changes: 8 additions & 0 deletions UPDATING.md
@@ -62,6 +62,14 @@ https://developers.google.com/style/inclusive-documentation
 -->
 
+### Changes in BigQueryHook
+- `create_empty_table` method now accepts a `table_resource` parameter. If provided, all
+  other parameters are ignored.
+- `create_empty_dataset` will now use values from `dataset_reference` instead of raising an
+  error if parameters were passed both in `dataset_reference` and as arguments to the method.
+  Additionally, validation of `dataset_reference` is done using `Dataset.from_api_repr`.
+  Exception and log messages have been changed.
+
 ### Added mypy plugin to preserve types of decorated functions
 
 Mypy currently doesn't support precise type information for decorated
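The first note above translates to the call pattern below. This is a minimal sketch, not part of the commit; it assumes a configured default GCP connection, and the project, dataset, and table names are placeholders:

```python
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

hook = BigQueryHook()  # assumes a configured default GCP connection

# When `table_resource` is supplied, the other table-shape arguments
# (schema_fields, time_partitioning, labels, view, ...) are ignored.
hook.create_empty_table(
    project_id="my-project",
    dataset_id="my_dataset",
    table_id="my_table",
    table_resource={
        "tableReference": {
            "projectId": "my-project",
            "datasetId": "my_dataset",
            "tableId": "my_table",
        },
        "schema": {
            "fields": [
                {"name": "id", "type": "INTEGER", "mode": "REQUIRED"},
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
            ]
        },
    },
)
```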
@@ -219,10 +219,10 @@
     # [START howto_operator_bigquery_create_view]
     create_view = BigQueryCreateEmptyTableOperator(
         task_id="create_view",
-        dataset_id=LOCATION_DATASET_NAME,
+        dataset_id=DATASET_NAME,
         table_id="test_view",
         view={
-            "query": "SELECT * FROM `{}.test_table`".format(DATASET_NAME),
+            "query": f"SELECT * FROM `{PROJECT_ID}.{DATASET_NAME}.test_table`",
             "useLegacySql": False
         }
     )
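For comparison, the same view could be created directly through the hook's `view` keyword documented later in this diff. A hypothetical sketch with placeholder names; standard-SQL view definitions cannot rely on a default dataset, which is why the DAG above switches to a fully qualified `project.dataset.table`:

```python
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

hook = BigQueryHook()

# The view query must qualify the table name itself; views do not inherit
# a default dataset the way interactive queries can.
hook.create_empty_table(
    project_id="my-project",
    dataset_id="my_dataset",
    table_id="test_view",
    view={
        "query": "SELECT * FROM `my-project.my_dataset.test_table`",
        "useLegacySql": False,
    },
)
```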
171 changes: 81 additions & 90 deletions airflow/providers/google/cloud/hooks/bigquery.py
@@ -26,6 +26,9 @@
 from copy import deepcopy
 from typing import Any, Dict, Iterable, List, Mapping, NoReturn, Optional, Tuple, Type, Union
 
+from google.api_core.retry import Retry
+from google.cloud.bigquery import DEFAULT_RETRY, Client, Dataset, Table
+from google.cloud.exceptions import NotFound
 from googleapiclient.discovery import build
 from googleapiclient.errors import HttpError
 from pandas import DataFrame
@@ -38,6 +41,7 @@
 from airflow.exceptions import AirflowException
 from airflow.hooks.dbapi_hook import DbApiHook
 from airflow.providers.google.common.hooks.base_google import GoogleBaseHook
+from airflow.utils.helpers import convert_camel_to_snake
 from airflow.utils.log.logging_mixin import LoggingMixin
 
 log = logging.getLogger(__name__)
@@ -136,7 +140,8 @@ def get_pandas_df(
             verbose=False,
             credentials=credentials)
 
-    def table_exists(self, project_id: str, dataset_id: str, table_id: str) -> bool:
+    @GoogleBaseHook.fallback_to_default_project_id
+    def table_exists(self, dataset_id: str, table_id: str, project_id: str) -> bool:
         """
         Checks for the existence of a table in Google BigQuery.
@@ -150,28 +155,30 @@ def table_exists(self, project_id: str, dataset_id: str, table_id: str) -> bool:
         :param table_id: The name of the table to check the existence of.
         :type table_id: str
         """
-        service = self.get_service()
+        table_reference = f"{project_id}.{dataset_id}.{table_id}"
 
         try:
-            service.tables().get(  # pylint: disable=no-member
-                projectId=project_id, datasetId=dataset_id,
-                tableId=table_id).execute(num_retries=self.num_retries)
+            Client(client_info=self.client_info).get_table(table_reference)
             return True
-        except HttpError as e:
-            if e.resp['status'] == '404':
-                return False
-            raise
-
-    def create_empty_table(self,  # pylint: disable=too-many-arguments
-                           project_id: str,
-                           dataset_id: str,
-                           table_id: str,
-                           schema_fields: Optional[List] = None,
-                           time_partitioning: Optional[Dict] = None,
-                           cluster_fields: Optional[List[str]] = None,
-                           labels: Optional[Dict] = None,
-                           view: Optional[Dict] = None,
-                           encryption_configuration: Optional[Dict] = None,
-                           num_retries: int = 5) -> None:
+        except NotFound:
+            return False
+
+    @GoogleBaseHook.fallback_to_default_project_id
+    def create_empty_table(  # pylint: disable=too-many-arguments
+        self,
+        project_id: str,
+        dataset_id: str,
+        table_id: str,
+        table_resource: Optional[Dict[str, Any]] = None,
+        schema_fields: Optional[List] = None,
+        time_partitioning: Optional[Dict] = None,
+        cluster_fields: Optional[List[str]] = None,
+        labels: Optional[Dict] = None,
+        view: Optional[Dict] = None,
+        encryption_configuration: Optional[Dict] = None,
+        retry: Optional[Retry] = DEFAULT_RETRY,
+        num_retries: Optional[int] = None
+    ) -> None:
         """
         Creates a new, empty table in the dataset.
         To create a view, which is defined by a SQL query, parse a dictionary to 'view' kwarg
@@ -182,11 +189,17 @@ def create_empty_table(self,  # pylint: disable=too-many-arguments
         :type dataset_id: str
         :param table_id: The Name of the table to be created.
         :type table_id: str
+        :param table_resource: Table resource as described in documentation:
+            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table
+            If provided, all other parameters are ignored.
+        :type table_resource: Dict[str, Any]
         :param schema_fields: If set, the schema field list as defined here:
             https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
         :type schema_fields: list
         :param labels: a dictionary containing labels for the table, passed to BigQuery
         :type labels: dict
+        :param retry: Optional. How to retry the RPC.
+        :type retry: google.api_core.retry.Retry
         **Example**: ::
@@ -227,119 +240,97 @@ def create_empty_table(self,  # pylint: disable=too-many-arguments
         :type num_retries: int
         :return: None
         """
-        service = self.get_service()
-
-        project_id = project_id if project_id is not None else self.project_id
+        if num_retries:
+            warnings.warn("Parameter `num_retries` is deprecated", DeprecationWarning)
 
-        table_resource = {
+        _table_resource: Dict[str, Any] = {
             'tableReference': {
-                'tableId': table_id
+                'tableId': table_id,
+                'projectId': project_id,
+                'datasetId': dataset_id,
             }
-        }  # type: Dict[str, Any]
+        }
 
         if self.location:
-            table_resource['location'] = self.location
+            _table_resource['location'] = self.location
 
         if schema_fields:
-            table_resource['schema'] = {'fields': schema_fields}
+            _table_resource['schema'] = {'fields': schema_fields}
 
         if time_partitioning:
-            table_resource['timePartitioning'] = time_partitioning
+            _table_resource['timePartitioning'] = time_partitioning
 
         if cluster_fields:
-            table_resource['clustering'] = {
+            _table_resource['clustering'] = {
                 'fields': cluster_fields
             }
 
         if labels:
-            table_resource['labels'] = labels
+            _table_resource['labels'] = labels
 
         if view:
-            table_resource['view'] = view
+            _table_resource['view'] = view
 
         if encryption_configuration:
-            table_resource["encryptionConfiguration"] = encryption_configuration
+            _table_resource["encryptionConfiguration"] = encryption_configuration
 
-        num_retries = num_retries if num_retries else self.num_retries
-
-        service.tables().insert(  # pylint: disable=no-member
-            projectId=project_id,
-            datasetId=dataset_id,
-            body=table_resource).execute(num_retries=num_retries)
+        table_resource = table_resource or _table_resource
+        table = Table.from_api_repr(table_resource)
+        Client(client_info=self.client_info).create_table(table=table, exists_ok=True, retry=retry)
 
+    @GoogleBaseHook.fallback_to_default_project_id
     def create_empty_dataset(self,
-                             dataset_id: str = "",
-                             project_id: str = "",
+                             dataset_id: Optional[str] = None,
+                             project_id: Optional[str] = None,
                              location: Optional[str] = None,
-                             dataset_reference: Optional[Dict] = None) -> None:
+                             dataset_reference: Optional[Dict[str, Any]] = None) -> None:
         """
         Create a new empty dataset:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/insert
         :param project_id: The name of the project where we want to create
             an empty a dataset. Don't need to provide, if projectId in dataset_reference.
         :type project_id: str
-        :param dataset_id: The id of dataset. Don't need to provide,
-            if datasetId in dataset_reference.
+        :param dataset_id: The id of dataset. Don't need to provide, if datasetId in dataset_reference.
         :type dataset_id: str
         :param location: (Optional) The geographic location where the dataset should reside.
             There is no default value but the dataset will be created in US if nothing is provided.
         :type location: str
-        :param dataset_reference: Dataset reference that could be provided
-            with request body. More info:
+        :param dataset_reference: Dataset reference that could be provided with request body. More info:
             https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource
         :type dataset_reference: dict
         """
-        service = self.get_service()
-
-        if dataset_reference:
-            _validate_value('dataset_reference', dataset_reference, dict)
-        else:
-            dataset_reference = {}
-
-        if "datasetReference" not in dataset_reference:
-            dataset_reference["datasetReference"] = {}
+        dataset_reference = dataset_reference or {"datasetReference": {}}
 
         if self.location:
             dataset_reference['location'] = dataset_reference.get('location') or self.location
 
-        if not dataset_reference["datasetReference"].get("datasetId") and not dataset_id:
-            raise ValueError(
-                "dataset_id not provided and datasetId not exist in the datasetReference. "
-                "Impossible to create dataset")
-
-        dataset_required_params = [(dataset_id, "datasetId", ""),
-                                   (project_id, "projectId", self.project_id)]
-        for param_tuple in dataset_required_params:
-            param, param_name, param_default = param_tuple
-            if param_name not in dataset_reference['datasetReference']:
-                if param_default and not param:
+        for param, value in zip(["datasetId", "projectId"], [dataset_id, project_id]):
+            specified_param = dataset_reference["datasetReference"].get(param)
+            if specified_param:
+                if value:
                     self.log.info(
-                        "%s was not specified. Will be used default value %s.",
-                        param_name, param_default
+                        "`%s` was provided in both `dataset_reference` and as `%s`. "
+                        "Using value from `dataset_reference`",
+                        param, convert_camel_to_snake(param)
                     )
-                    param = param_default
-                dataset_reference['datasetReference'].update(
-                    {param_name: param})
-            elif param:
-                _api_resource_configs_duplication_check(
-                    param_name, param,
-                    dataset_reference['datasetReference'], 'dataset_reference')
+                continue  # use specified value
+            if not value:
+                raise ValueError(
+                    f"Please specify `{param}` either in `dataset_reference` "
+                    f"or by providing `{convert_camel_to_snake(param)}`",
+                )
+            # dataset_reference has no param but we can fallback to default value
+            self.log.info(
+                "%s was not specified in `dataset_reference`. Will use default value %s.",
+                param, value
+            )
+            dataset_reference["datasetReference"][param] = value
 
         location = location or self.location
         if location:
-            if 'location' not in dataset_reference:
-                dataset_reference.update({'location': location})
-            else:
-                _api_resource_configs_duplication_check(
-                    'location', location,
-                    dataset_reference, 'dataset_reference')
+            dataset_reference["location"] = dataset_reference.get("location", location)
 
-        dataset_id = dataset_reference.get("datasetReference").get("datasetId")  # type: ignore
-        dataset_project_id = dataset_reference.get("datasetReference").get("projectId")  # type: ignore
-
-        service.datasets().insert(  # pylint: disable=no-member
-            projectId=dataset_project_id,
-            body=dataset_reference).execute(num_retries=self.num_retries)
+        dataset = Dataset.from_api_repr(dataset_reference)
+        Client(client_info=self.client_info).create_dataset(dataset=dataset, exists_ok=True)
 
     def get_dataset_tables(self, dataset_id: str, project_id: Optional[str] = None,
                            max_results: Optional[int] = None,
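Taken together, the rewritten methods behave as in the following sketch (a minimal illustration, not from the commit; connection and names are placeholders). Note the precedence rule: values already present in `dataset_reference` win over the corresponding keyword arguments:

```python
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

hook = BigQueryHook()  # assumes a configured default GCP connection

# table_exists now calls google-cloud-bigquery's Client.get_table and maps
# its NotFound exception to False instead of parsing HttpError status codes.
if not hook.table_exists(dataset_id="my_dataset", table_id="users", project_id="my-project"):
    # `projectId` is taken from dataset_reference ("my-project"); the
    # conflicting project_id argument is logged and ignored. `datasetId`
    # is absent from the reference, so the dataset_id argument fills it in.
    hook.create_empty_dataset(
        dataset_id="my_dataset",
        project_id="some-other-project",
        dataset_reference={"datasetReference": {"projectId": "my-project"}},
    )
```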
