From ab0cf4cc03292f62b56a8813cfb7681daa87f872 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Mon, 12 Feb 2024 13:46:25 -0600
Subject: [PATCH] feat: support nullable boolean and Int64 dtypes in `insert_rows_from_dataframe` (#1816)

---
 google/cloud/bigquery/_pandas_helpers.py | 19 +++++++
 tests/system/test_pandas.py              | 13 ++++-
 tests/unit/test__pandas_helpers.py       | 65 +++++++++++++++++-------
 3 files changed, 79 insertions(+), 18 deletions(-)

diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index bcc869f15..e97dda7e5 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -958,6 +958,25 @@ def dataframe_to_json_generator(dataframe):
             # considered a NaN, however.
             if isinstance(is_nan, bool) and is_nan:
                 continue
+
+            # Convert numpy types to corresponding Python types.
+            # https://stackoverflow.com/a/60441783/101923
+            if isinstance(value, numpy.bool_):
+                value = bool(value)
+            elif isinstance(
+                value,
+                (
+                    numpy.int64,
+                    numpy.int32,
+                    numpy.int16,
+                    numpy.int8,
+                    numpy.uint64,
+                    numpy.uint32,
+                    numpy.uint16,
+                    numpy.uint8,
+                ),
+            ):
+                value = int(value)
             output[column] = value
 
         yield output
diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py
index e93f245c0..85c7b79e6 100644
--- a/tests/system/test_pandas.py
+++ b/tests/system/test_pandas.py
@@ -835,7 +835,9 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
     schema = [
         SF("float_col", "FLOAT", mode="REQUIRED"),
         SF("int_col", "INTEGER", mode="REQUIRED"),
+        SF("int64_col", "INTEGER", mode="NULLABLE"),
         SF("bool_col", "BOOLEAN", mode="REQUIRED"),
+        SF("boolean_col", "BOOLEAN", mode="NULLABLE"),
         SF("string_col", "STRING", mode="NULLABLE"),
         SF("date_col", "DATE", mode="NULLABLE"),
         SF("time_col", "TIME", mode="NULLABLE"),
@@ -898,6 +900,15 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
     dataframe["date_col"] = dataframe["date_col"].astype("dbdate")
     dataframe["time_col"] = dataframe["time_col"].astype("dbtime")
 
+    # Support nullable integer and boolean dtypes.
+    # https://github.com/googleapis/python-bigquery/issues/1815
+    dataframe["int64_col"] = pandas.Series(
+        [-11, -22, pandas.NA, -44, -55, -66], dtype="Int64"
+    )
+    dataframe["boolean_col"] = pandas.Series(
+        [True, False, True, pandas.NA, True, False], dtype="boolean"
+    )
+
     table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe"
     table_arg = bigquery.Table(table_id, schema=schema)
     table = helpers.retry_403(bigquery_client.create_table)(table_arg)
@@ -910,7 +921,7 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
     expected = [
         # Pandas often represents NULL values as NaN. Convert to None for
         # easier comparison.
-        tuple(None if col != col else col for col in data_row)
+        tuple(None if pandas.isna(col) else col for col in data_row)
         for data_row in dataframe.itertuples(index=False)
     ]
 
diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py
index ad40a6da6..7c83d3ec5 100644
--- a/tests/unit/test__pandas_helpers.py
+++ b/tests/unit/test__pandas_helpers.py
@@ -808,29 +808,60 @@ def test_list_columns_and_indexes_with_named_index_same_as_column_name(
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
 def test_dataframe_to_json_generator(module_under_test):
     utcnow = datetime.datetime.utcnow()
-    df_data = collections.OrderedDict(
-        [
-            ("a_series", [pandas.NA, 2, 3, 4]),
-            ("b_series", [0.1, float("NaN"), 0.3, 0.4]),
-            ("c_series", ["a", "b", pandas.NA, "d"]),
-            ("d_series", [utcnow, utcnow, utcnow, pandas.NaT]),
-            ("e_series", [True, False, True, None]),
-        ]
-    )
     dataframe = pandas.DataFrame(
-        df_data, index=pandas.Index([4, 5, 6, 7], name="a_index")
+        {
+            "a_series": [1, 2, 3, 4],
+            "b_series": [0.1, float("NaN"), 0.3, 0.4],
+            "c_series": ["a", "b", pandas.NA, "d"],
+            "d_series": [utcnow, utcnow, utcnow, pandas.NaT],
+            "e_series": [True, False, True, None],
+            # Support nullable dtypes.
+            # https://github.com/googleapis/python-bigquery/issues/1815
+            "boolean_series": pandas.Series(
+                [True, False, pandas.NA, False], dtype="boolean"
+            ),
+            "int64_series": pandas.Series([-1, pandas.NA, -3, -4], dtype="Int64"),
+        }
     )
 
-    dataframe = dataframe.astype({"a_series": pandas.Int64Dtype()})
+    # Index is not included, even if it is not the default and has a name.
+    dataframe = dataframe.rename(index=lambda idx: idx + 4)
+    dataframe.index.name = "a_index"
 
-    rows = module_under_test.dataframe_to_json_generator(dataframe)
+    rows = list(module_under_test.dataframe_to_json_generator(dataframe))
     expected = [
-        {"b_series": 0.1, "c_series": "a", "d_series": utcnow, "e_series": True},
-        {"a_series": 2, "c_series": "b", "d_series": utcnow, "e_series": False},
-        {"a_series": 3, "b_series": 0.3, "d_series": utcnow, "e_series": True},
-        {"a_series": 4, "b_series": 0.4, "c_series": "d"},
+        {
+            "a_series": 1,
+            "b_series": 0.1,
+            "c_series": "a",
+            "d_series": utcnow,
+            "e_series": True,
+            "boolean_series": True,
+            "int64_series": -1,
+        },
+        {
+            "a_series": 2,
+            "c_series": "b",
+            "d_series": utcnow,
+            "e_series": False,
+            "boolean_series": False,
+        },
+        {
+            "a_series": 3,
+            "b_series": 0.3,
+            "d_series": utcnow,
+            "e_series": True,
+            "int64_series": -3,
+        },
+        {
+            "a_series": 4,
+            "b_series": 0.4,
+            "c_series": "d",
+            "boolean_series": False,
+            "int64_series": -4,
+        },
     ]
-    assert list(rows) == expected
+    assert rows == expected
 
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")