Fix big query to mssql/mysql transfer issues (#20001)
jon-fearer committed Dec 30, 2021
1 parent ac9f29d commit af4a2b0
Showing 6 changed files with 121 additions and 91 deletions.
9 changes: 8 additions & 1 deletion airflow/providers/google/cloud/hooks/bigquery.py
@@ -1299,7 +1299,14 @@ def get_tabledata(
:return: list of rows
"""
warnings.warn("This method is deprecated. Please use `list_rows`.", DeprecationWarning)
rows = self.list_rows(dataset_id, table_id, max_results, selected_fields, page_token, start_index)
rows = self.list_rows(
dataset_id=dataset_id,
table_id=table_id,
max_results=max_results,
selected_fields=selected_fields,
page_token=page_token,
start_index=start_index,
)
return [dict(r) for r in rows]

@GoogleBaseHook.fallback_to_default_project_id
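
The deprecated get_tabledata now forwards to list_rows with explicit keyword arguments and converts each Row to a dict. A minimal sketch of the non-deprecated call path — the connection id, dataset, and table names below are illustrative assumptions, not values from this commit:

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

hook = BigQueryHook(gcp_conn_id='google_cloud_default')
rows = hook.list_rows(
    dataset_id='my_dataset',  # hypothetical dataset
    table_id='my_table',      # hypothetical table
    max_results=1000,
    start_index=0,
)
row_dicts = [dict(r) for r in rows]  # the same conversion get_tabledata applies
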
69 changes: 27 additions & 42 deletions airflow/providers/google/cloud/transfers/bigquery_to_mssql.py
@@ -16,12 +16,11 @@
# specific language governing permissions and limitations
# under the License.
"""This module contains Google BigQuery to MSSQL operator."""
from typing import TYPE_CHECKING, Optional, Sequence, Union

from google.cloud.bigquery.table import TableReference
from typing import TYPE_CHECKING, List, Optional, Sequence, Union

from airflow.models import BaseOperator
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
from airflow.providers.google.cloud.utils.bigquery_get_data import bigquery_get_data
from airflow.providers.microsoft.mssql.hooks.mssql import MsSqlHook

if TYPE_CHECKING:
@@ -57,7 +56,7 @@ class BigQueryToMsSqlOperator(BaseOperator):
:type source_project_dataset_table: str
:param selected_fields: List of fields to return (comma-separated). If
unspecified, all fields are returned.
:type selected_fields: str
:type selected_fields: List[str] | str
:param gcp_conn_id: reference to a specific Google Cloud hook.
:type gcp_conn_id: str
:param delegate_to: The account to impersonate using domain-wide delegation of authority,
@@ -82,7 +81,7 @@ class BigQueryToMsSqlOperator(BaseOperator):
If set as a sequence, the identities from the list must grant
Service Account Token Creator IAM role to the directly preceding identity, with first
account from the list granting this role to the originating account (templated).
:type impersonation_chain: Union[str, Sequence[str]]
:type impersonation_chain: str | Sequence[str]
"""

template_fields = ('source_project_dataset_table', 'mssql_table', 'impersonation_chain')
@@ -92,7 +91,7 @@ def __init__(
*,
source_project_dataset_table: str,
mssql_table: str,
selected_fields: Optional[str] = None,
selected_fields: Optional[Union[List[str], str]] = None,
gcp_conn_id: str = 'google_cloud_default',
mssql_conn_id: str = 'mssql_default',
database: Optional[str] = None,
@@ -114,47 +113,33 @@ def __init__(
self.batch_size = batch_size
self.location = location
self.impersonation_chain = impersonation_chain
try:
_, self.dataset_id, self.table_id = source_project_dataset_table.split('.')
except ValueError:
raise ValueError(
f'Could not parse {source_project_dataset_table} as <project>.<dataset>.<table>'
) from None
self.source_project_dataset_table = source_project_dataset_table

def _bq_get_data(self):

hook = BigQueryHook(
def execute(self, context: 'Context') -> None:
big_query_hook = BigQueryHook(
gcp_conn_id=self.gcp_conn_id,
delegate_to=self.delegate_to,
location=self.location,
impersonation_chain=self.impersonation_chain,
)
table_ref = TableReference.from_string(self.source_project_dataset_table)
self.log.info('Fetching Data from:')
self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id, table_ref.table_id)

conn = hook.get_conn()
cursor = conn.cursor()
i = 0
while True:
response = cursor.get_tabledata(
dataset_id=table_ref.dataset_id,
table_id=table_ref.table_id,
max_results=self.batch_size,
selected_fields=self.selected_fields,
start_index=i * self.batch_size,
)

if 'rows' not in response:
self.log.info('Job Finished')
return

rows = response['rows']

self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)

table_data = []
table_data = [[fields['v'] for fields in dict_row['f']] for dict_row in rows]

yield table_data
i += 1

def execute(self, context: 'Context'):
mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id, schema=self.database)
for rows in self._bq_get_data():
mssql_hook.insert_rows(self.mssql_table, rows, replace=self.replace)
for rows in bigquery_get_data(
self.log,
self.dataset_id,
self.table_id,
big_query_hook,
self.batch_size,
self.selected_fields,
):
mssql_hook.insert_rows(
table=self.mssql_table,
rows=rows,
target_fields=self.selected_fields,
replace=self.replace,
)
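
With the get_tabledata-based generator gone, the operator now streams batches through the shared bigquery_get_data helper and passes selected_fields through to insert_rows as target_fields. A usage sketch under assumed names — the task id, tables, and field list are illustrative, not values from this commit:

from airflow.providers.google.cloud.transfers.bigquery_to_mssql import BigQueryToMsSqlOperator

transfer = BigQueryToMsSqlOperator(
    task_id='bq_to_mssql',  # hypothetical task id
    source_project_dataset_table='my-project.my_dataset.my_table',  # must parse as <project>.<dataset>.<table>
    mssql_table='dbo.my_table',      # hypothetical target table
    selected_fields=['id', 'name'],  # a list is now accepted alongside a comma-separated string
    batch_size=1000,
    replace=False,
)
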
62 changes: 22 additions & 40 deletions airflow/providers/google/cloud/transfers/bigquery_to_mysql.py
@@ -16,10 +16,11 @@
# specific language governing permissions and limitations
# under the License.
"""This module contains Google BigQuery to MySQL operator."""
from typing import TYPE_CHECKING, Optional, Sequence, Union
from typing import TYPE_CHECKING, List, Optional, Sequence, Union

from airflow.models import BaseOperator
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
from airflow.providers.google.cloud.utils.bigquery_get_data import bigquery_get_data
from airflow.providers.mysql.hooks.mysql import MySqlHook

if TYPE_CHECKING:
@@ -54,7 +55,7 @@ class BigQueryToMySqlOperator(BaseOperator):
:type dataset_table: str
:param selected_fields: List of fields to return (comma-separated). If
unspecified, all fields are returned.
:type selected_fields: str
:type selected_fields: List[str] | str
:param gcp_conn_id: reference to a specific Google Cloud hook.
:type gcp_conn_id: str
:param delegate_to: The account to impersonate using domain-wide delegation of authority,
@@ -79,7 +80,7 @@ class BigQueryToMySqlOperator(BaseOperator):
If set as a sequence, the identities from the list must grant
Service Account Token Creator IAM role to the directly preceding identity, with first
account from the list granting this role to the originating account (templated).
:type impersonation_chain: Union[str, Sequence[str]]
:type impersonation_chain: str | Sequence[str]
"""

template_fields = (
@@ -94,7 +95,7 @@ def __init__(
*,
dataset_table: str,
mysql_table: str,
selected_fields: Optional[str] = None,
selected_fields: Optional[Union[List[str], str]] = None,
gcp_conn_id: str = 'google_cloud_default',
mysql_conn_id: str = 'mysql_default',
database: Optional[str] = None,
@@ -119,46 +120,27 @@ def __init__(
try:
self.dataset_id, self.table_id = dataset_table.split('.')
except ValueError:
raise ValueError(f'Could not parse {dataset_table} as <dataset>.<table>')
raise ValueError(f'Could not parse {dataset_table} as <dataset>.<table>') from None

def _bq_get_data(self):
self.log.info('Fetching Data from:')
self.log.info('Dataset: %s ; Table: %s', self.dataset_id, self.table_id)

hook = BigQueryHook(
def execute(self, context: 'Context') -> None:
big_query_hook = BigQueryHook(
gcp_conn_id=self.gcp_conn_id,
delegate_to=self.delegate_to,
location=self.location,
impersonation_chain=self.impersonation_chain,
)

i = 0
while True:
response = hook.list_rows(
dataset_id=self.dataset_id,
table_id=self.table_id,
max_results=self.batch_size,
selected_fields=self.selected_fields,
start_index=i * self.batch_size,
)
rows = [dict(r) for r in response]
if len(rows) == 0:
self.log.info('Job Finished')
return

self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)

table_data = []
for dict_row in rows:
single_row = []
for fields in dict_row['f']:
single_row.append(fields['v'])
table_data.append(single_row)

yield table_data
i += 1

def execute(self, context: 'Context'):
mysql_hook = MySqlHook(schema=self.database, mysql_conn_id=self.mysql_conn_id)
for rows in self._bq_get_data():
mysql_hook.insert_rows(self.mysql_table, rows, replace=self.replace)
for rows in bigquery_get_data(
self.log,
self.dataset_id,
self.table_id,
big_query_hook,
self.batch_size,
self.selected_fields,
):
mysql_hook.insert_rows(
table=self.mysql_table,
rows=rows,
target_fields=self.selected_fields,
replace=self.replace,
)
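
The MySQL operator gets the same treatment: the hand-rolled paging loop is replaced by bigquery_get_data, and selected_fields flows through to insert_rows as target_fields. A usage sketch with hypothetical names:

from airflow.providers.google.cloud.transfers.bigquery_to_mysql import BigQueryToMySqlOperator

transfer = BigQueryToMySqlOperator(
    task_id='bq_to_mysql',  # hypothetical task id
    dataset_table='my_dataset.my_table',  # must parse as <dataset>.<table>
    mysql_table='my_table',               # hypothetical target table
    selected_fields=['id', 'name'],
    batch_size=1000,
)
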
56 changes: 56 additions & 0 deletions airflow/providers/google/cloud/utils/bigquery_get_data.py
@@ -0,0 +1,56 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from collections.abc import Iterator
from logging import Logger
from typing import List, Optional, Union

from google.cloud.bigquery.table import Row

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook


def bigquery_get_data(
logger: Logger,
dataset_id: str,
table_id: str,
big_query_hook: BigQueryHook,
batch_size: int,
selected_fields: Optional[Union[List[str], str]],
) -> Iterator:
logger.info('Fetching Data from:')
logger.info('Dataset: %s ; Table: %s', dataset_id, table_id)

i = 0
while True:
rows: List[Row] = big_query_hook.list_rows(
dataset_id=dataset_id,
table_id=table_id,
max_results=batch_size,
selected_fields=selected_fields,
start_index=i * batch_size,
)

if len(rows) == 0:
logger.info('Job Finished')
return

logger.info('Total Extracted rows: %s', len(rows) + i * batch_size)

yield [row.values() for row in rows]

i += 1
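
The helper pages through the table with list_rows, yielding one batch of row-value lists per iteration until an empty page signals the end. A consumption sketch, assuming a configured google_cloud_default connection and hypothetical dataset/table names:

import logging

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
from airflow.providers.google.cloud.utils.bigquery_get_data import bigquery_get_data

hook = BigQueryHook(gcp_conn_id='google_cloud_default')
for batch in bigquery_get_data(
    logging.getLogger(__name__),
    'my_dataset',  # hypothetical dataset
    'my_table',    # hypothetical table
    hook,
    1000,          # batch_size
    None,          # selected_fields: None fetches all columns
):
    print(len(batch))  # each batch is a list of row-value lists
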
1 change: 1 addition & 0 deletions docs/spelling_wordlist.txt
@@ -39,6 +39,7 @@ BaseView
BaseXCom
Beauchemin
Behaviour
BigQueryHook
Bigquery
Bigtable
Bitshift
15 changes: 7 additions & 8 deletions tests/providers/google/cloud/transfers/test_bigquery_to_mssql.py
@@ -40,12 +40,11 @@ def test_execute_good_request_to_bq(self, mock_hook):

operator.execute(None)
# fmt: off
mock_hook.return_value.get_conn.return_value.cursor.return_value.get_tabledata\
.assert_called_once_with(
dataset_id=TEST_DATASET,
table_id=TEST_TABLE_ID,
max_results=1000,
selected_fields=None,
start_index=0,
)
mock_hook.return_value.list_rows.assert_called_once_with(
dataset_id=TEST_DATASET,
table_id=TEST_TABLE_ID,
max_results=1000,
selected_fields=None,
start_index=0,
)
# fmt: on
