feat: support nullable boolean and Int64 dtypes in `insert_rows_from_dataframe`
googleapis · Linchin · Feb 12, 2024 · Feb 9, 2024 · Feb 9, 2024 · e1c39c60e4c84150e37628889985aff0191b23fd
commit e1c39c60e4c84150e37628889985aff0191b23fd
@@ -958,6 +958,25 @@ def dataframe_to_json_generator(dataframe):
  # considered a NaN, however.
  if isinstance(is_nan, bool) and is_nan:
  continue
+
+ # Convert numpy types to corresponding Python types.
+ # https://stackoverflow.com/a/60441783/101923
+ if isinstance(value, numpy.bool_):
+ value = bool(value)
+ elif isinstance(
+ value,
+ (
+ numpy.int64,
+ numpy.int32,
+ numpy.int16,
+ numpy.int8,
+ numpy.uint64,
+ numpy.uint32,
+ numpy.uint16,
+ numpy.uint8,
+ ),
+ ):
+ value = int(value)
  output[column] = value
 
  yield output

@@ -835,7 +835,9 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
  schema = [
  SF("float_col", "FLOAT", mode="REQUIRED"),
  SF("int_col", "INTEGER", mode="REQUIRED"),
+ SF("int64_col", "INTEGER", mode="NULLABLE"),
  SF("bool_col", "BOOLEAN", mode="REQUIRED"),
+ SF("boolean_col", "BOOLEAN", mode="NULLABLE"),
  SF("string_col", "STRING", mode="NULLABLE"),
  SF("date_col", "DATE", mode="NULLABLE"),
  SF("time_col", "TIME", mode="NULLABLE"),
@@ -898,6 +900,15 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
  dataframe["date_col"] = dataframe["date_col"].astype("dbdate")
  dataframe["time_col"] = dataframe["time_col"].astype("dbtime")
 
+ # Support nullable integer and boolean dtypes.
+ # https://github.com/googleapis/python-bigquery/issues/1815
+ dataframe["int64_col"] = pandas.Series(
+ [-11, -22, pandas.NA, -44, -55, -66], dtype="Int64"
+ )
+ dataframe["boolean_col"] = pandas.Series(
+ [True, False, True, pandas.NA, True, False], dtype="boolean"
+ )
+
  table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe"
  table_arg = bigquery.Table(table_id, schema=schema)
  table = helpers.retry_403(bigquery_client.create_table)(table_arg)
@@ -910,7 +921,7 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
  expected = [
  # Pandas often represents NULL values as NaN. Convert to None for
  # easier comparison.
- tuple(None if col != col else col for col in data_row)
+ tuple(None if pandas.isna(col) else col for col in data_row)
  for data_row in dataframe.itertuples(index=False)
  ]
 

@@ -808,29 +808,60 @@ def test_list_columns_and_indexes_with_named_index_same_as_column_name(
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
 def test_dataframe_to_json_generator(module_under_test):
  utcnow = datetime.datetime.utcnow()
- df_data = collections.OrderedDict(
- [
- ("a_series", [pandas.NA, 2, 3, 4]),
- ("b_series", [0.1, float("NaN"), 0.3, 0.4]),
- ("c_series", ["a", "b", pandas.NA, "d"]),
- ("d_series", [utcnow, utcnow, utcnow, pandas.NaT]),
- ("e_series", [True, False, True, None]),
- ]
- )
  dataframe = pandas.DataFrame(
- df_data, index=pandas.Index([4, 5, 6, 7], name="a_index")
+ {
+ "a_series": [1, 2, 3, 4],
+ "b_series": [0.1, float("NaN"), 0.3, 0.4],
+ "c_series": ["a", "b", pandas.NA, "d"],
+ "d_series": [utcnow, utcnow, utcnow, pandas.NaT],
+ "e_series": [True, False, True, None],
+ # Support nullable dtypes.
+ # https://github.com/googleapis/python-bigquery/issues/1815
+ "boolean_series": pandas.Series(
+ [True, False, pandas.NA, False], dtype="boolean"
+ ),
+ "int64_series": pandas.Series([-1, pandas.NA, -3, -4], dtype="Int64"),
+ }
  )
 
- dataframe = dataframe.astype({"a_series": pandas.Int64Dtype()})
+ # Index is not included, even if it is not the default and has a name.
+ dataframe = dataframe.rename(index=lambda idx: idx + 4)
+ dataframe.index.name = "a_index"
 
- rows = module_under_test.dataframe_to_json_generator(dataframe)
+ rows = list(module_under_test.dataframe_to_json_generator(dataframe))
  expected = [
- {"b_series": 0.1, "c_series": "a", "d_series": utcnow, "e_series": True},
- {"a_series": 2, "c_series": "b", "d_series": utcnow, "e_series": False},
- {"a_series": 3, "b_series": 0.3, "d_series": utcnow, "e_series": True},
- {"a_series": 4, "b_series": 0.4, "c_series": "d"},
+ {
+ "a_series": 1,
+ "b_series": 0.1,
+ "c_series": "a",
+ "d_series": utcnow,
+ "e_series": True,
+ "boolean_series": True,
+ "int64_series": -1,
+ },
+ {
+ "a_series": 2,
+ "c_series": "b",
+ "d_series": utcnow,
+ "e_series": False,
+ "boolean_series": False,
+ },
+ {
+ "a_series": 3,
+ "b_series": 0.3,
+ "d_series": utcnow,
+ "e_series": True,
+ "int64_series": -3,
+ },
+ {
+ "a_series": 4,
+ "b_series": 0.4,
+ "c_series": "d",
+ "boolean_series": False,
+ "int64_series": -4,
+ },
  ]
- assert list(rows) == expected
+ assert rows == expected
 
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")