Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: avoid unnecessary API call in QueryJob.result() when job is already finished #1900

Merged
merged 14 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
103 changes: 79 additions & 24 deletions google/cloud/bigquery/job/query.py
Expand Up @@ -17,6 +17,7 @@
import concurrent.futures
import copy
import re
import time
import typing
from typing import Any, Dict, Iterable, List, Optional, Union

Expand Down Expand Up @@ -1439,6 +1440,12 @@ def _done_or_raise(self, retry=DEFAULT_RETRY, timeout=None):
# stored in _blocking_poll() in the process of polling for job completion.
transport_timeout = timeout if timeout is not None else self._transport_timeout

# We could have been given a generic object() sentinel to represent a
# default timeout.
transport_timeout = (
transport_timeout if isinstance(timeout, (int, float)) else None
)

try:
self._reload_query_results(retry=retry, timeout=transport_timeout)
except exceptions.GoogleAPIError as exc:
Expand Down Expand Up @@ -1528,6 +1535,10 @@ def result( # type: ignore # (incompatible with supertype)
If Non-``None`` and non-default ``job_retry`` is
provided and the job is not retryable.
"""
# Note: Since waiting for a query job to finish is more complex than
# refreshing the job state in a loop, we avoid calling the superclass
# in this method.

if self.dry_run:
return _EmptyRowIterator(
project=self.project,
Expand All @@ -1548,46 +1559,90 @@ def result( # type: ignore # (incompatible with supertype)
" provided to the query that created this job."
)

first = True
restart_query_job = False

def do_get_result():
nonlocal first
def is_job_done():
nonlocal restart_query_job

if first:
first = False
else:
# Note that we won't get here if retry_do_query is
# None, because we won't use a retry.
if restart_query_job:
restart_query_job = False

# The orinal job is failed. Create a new one.
tswast marked this conversation as resolved.
Show resolved Hide resolved
#
# Note that we won't get here if retry_do_query is
# None, because we won't use a retry.
job = retry_do_query()

# If it's already failed, we might as well stop:
if job.done() and job.exception() is not None:
raise job.exception()

# Become the new job:
self.__dict__.clear()
self.__dict__.update(job.__dict__)

# This shouldn't be necessary, because once we have a good
# job, it should stay good,and we shouldn't have to retry.
# But let's be paranoid. :)
# It's possible the job fails again and we'll have to
# retry that too.
self._retry_do_query = retry_do_query
self._job_retry = job_retry

super(QueryJob, self).result(retry=retry, timeout=timeout)

# Since the job could already be "done" (e.g. got a finished job
# via client.get_job), the superclass call to done() might not
# set the self._query_results cache.
if self._query_results is None or not self._query_results.complete:
self._reload_query_results(retry=retry, timeout=timeout)
# Refresh the job status with jobs.get because some of the
# exceptions thrown by jobs.getQueryResults like timeout and
# rateLimitExceeded errors are ambiguous. We want to know if
# the query job failed and not just the call to
# jobs.getQueryResults.
if self.done(retry=retry, timeout=timeout):
# If it's already failed, we might as well stop.
if self.exception() is not None:
# Only try to restart the query job if the job failed for
# a retriable reason. For example, don't restart the query
# if the call to reload the job metadata within self.done()
# timed out.
restart_query_job = True
raise self.exception()
else:
# Optimization: avoid a call to jobs.getQueryResults if
# it's already been fetched, e.g. from jobs.query first
# page of results.
if (
self._query_results is None
or not self._query_results.complete
):
self._reload_query_results(retry=retry, timeout=timeout)
return True

# Call jobs.getQueryResults with max results set to 0 just to
# wait for the query to finish. Unlike most methods,
# jobs.getQueryResults hangs as long as it can to ensure we
# know when the query has finished as soon as possible.
self._reload_query_results(retry=retry, timeout=timeout)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uh oh, if jobs.getQueryResults fails because the job failed it can throw an exception but restart_query_job will still be False.

But we don't want restart_query_job = True because sometimes this can raise an ambiguous exception such as quota exceeded, where we don't know if it's the job quota and it's a failed job or at a higher level (Google Frontend - GFE) where the job might actually still be running and/or succeeded.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't the worst way to fail, but it'd be nice to do the jobs.get call above in case of an exception to get a chance at retrying this job if the job failed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #1903 to track improvements to ambiguous errors. 12fa9fb fixes an issue where we weren't actually retrying after an ambiguous failure even though we thought we were.


# Even if the query is finished now according to
# jobs.getQueryResults, we'll want to reload the job status if
# it's not already DONE.
return False

if retry_do_query is not None and job_retry is not None:
do_get_result = job_retry(do_get_result)
is_job_done = job_retry(is_job_done)

do_get_result()
# timeout can be `None` or an object from our superclass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which superclass are we discussing here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

google.api_core.future.polling.PollingFuture._DEFAULT_VALUE introduced in googleapis/python-api-core#462.

I've updated the comments with some more info as well as some things to consider in case we want to have a default value for timeout in future.

# indicating a default timeout.
remaining_timeout = timeout if isinstance(timeout, (float, int)) else None

if remaining_timeout is None:
# Since is_job_done() calls jobs.getQueryResults, which is a
# long-running API, don't delay the next request at all.
while not is_job_done():
pass
else:
# Use a monotonic clock since we don't actually care about
# daylight savings or similar, just the elapsed time.
previous_time = time.monotonic()

while not is_job_done():
current_time = time.monotonic()
elapsed_time = current_time - previous_time
remaining_timeout = remaining_timeout - elapsed_time
previous_time = current_time

if remaining_timeout < 0:
raise concurrent.futures.TimeoutError()

except exceptions.GoogleAPICallError as exc:
exc.message = _EXCEPTION_FOOTER_TEMPLATE.format(
Expand Down
64 changes: 30 additions & 34 deletions tests/unit/job/test_query.py
Expand Up @@ -970,7 +970,12 @@ def test_result(self):
"rows": [{"f": [{"v": "abc"}]}],
}
conn = make_connection(
query_resource, query_resource_done, job_resource_done, query_page_resource
job_resource,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I am tracking the relationship between the make connection inputs versus the assert_has_calls checks.

Can you explain how these tests are supposed to work?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make_connection is a convention in google-cloud-bigquery unit tests that actually predates our use of the "mock" package. It mocks out the responses to REST API calls, previously with a fake implementation of our "Connection" class from the _http module and now with a true mock object. For every quest that our test makes, there should be a corresponding response. As with Mock.side_effect, any exceptions in this list will be raised, instead.

I'm guessing your question also relates to "Why this particular set of requests / responses?". I've added some comments explaining why we're expecting this sequence of API calls. I've also updated this test to more explicitly check for a possible cause of customer issue b/332850329.

query_resource,
job_resource,
query_resource_done,
job_resource_done,
query_page_resource,
)
client = _make_client(self.PROJECT, connection=conn)
job = self._get_target_class().from_api_repr(job_resource, client)
Expand Down Expand Up @@ -1014,7 +1019,14 @@ def test_result(self):
timeout=None,
)
conn.api_request.assert_has_calls(
[query_results_call, query_results_call, reload_call, query_page_call]
[
reload_call,
query_results_call,
reload_call,
query_results_call,
reload_call,
query_page_call,
]
)

def test_result_dry_run(self):
Expand Down Expand Up @@ -1254,9 +1266,13 @@ def test_result_w_retry(self):
}

connection = make_connection(
exceptions.NotFound("not normally retriable"),
job_resource,
exceptions.NotFound("not normally retriable"),
query_resource,
exceptions.NotFound("not normally retriable"),
job_resource,
exceptions.NotFound("not normally retriable"),
query_resource_done,
exceptions.NotFound("not normally retriable"),
job_resource_done,
Expand Down Expand Up @@ -1289,7 +1305,18 @@ def test_result_w_retry(self):
)

connection.api_request.assert_has_calls(
[query_results_call, query_results_call, reload_call]
[
reload_call,
chalmerlowe marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing here. Can I get some clarity on what we are doing and looking for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some explanation here as well as above in the make_connection() call.

reload_call,
query_results_call,
query_results_call,
reload_call,
reload_call,
query_results_call,
query_results_call,
reload_call,
reload_call,
]
)

def test_result_w_empty_schema(self):
Expand All @@ -1316,37 +1343,6 @@ def test_result_w_empty_schema(self):
self.assertEqual(result.location, "asia-northeast1")
self.assertEqual(result.query_id, "xyz-abc")

def test_result_invokes_begins(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As of #967 released in google-cloud-bigquery 3.0.0, the _begin method is no longer used for query jobs.

begun_resource = self._make_resource()
incomplete_resource = {
"jobComplete": False,
"jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
"schema": {"fields": [{"name": "col1", "type": "STRING"}]},
}
query_resource = copy.deepcopy(incomplete_resource)
query_resource["jobComplete"] = True
done_resource = copy.deepcopy(begun_resource)
done_resource["status"] = {"state": "DONE"}
connection = make_connection(
begun_resource,
incomplete_resource,
query_resource,
done_resource,
query_resource,
)
client = _make_client(project=self.PROJECT, connection=connection)
job = self._make_one(self.JOB_ID, self.QUERY, client)

job.result()

self.assertEqual(len(connection.api_request.call_args_list), 4)
begin_request = connection.api_request.call_args_list[0]
query_request = connection.api_request.call_args_list[2]
reload_request = connection.api_request.call_args_list[3]
self.assertEqual(begin_request[1]["method"], "POST")
self.assertEqual(query_request[1]["method"], "GET")
self.assertEqual(reload_request[1]["method"], "GET")

def test_result_w_timeout(self):
import google.cloud.bigquery.client

Expand Down
33 changes: 21 additions & 12 deletions tests/unit/test__job_helpers.py
Expand Up @@ -896,18 +896,10 @@ def test_query_and_wait_caches_completed_query_results_more_pages():
timeout=None,
)

# TODO(swast): Fetching job metadata isn't necessary in this case.
jobs_get_path = "/projects/response-project/jobs/response-job-id"
client._call_api.assert_any_call(
None, # retry
span_name="BigQuery.job.reload",
span_attributes={"path": jobs_get_path},
job_ref=mock.ANY,
method="GET",
path=jobs_get_path,
query_params={"location": "response-location"},
timeout=None,
)
# Note: There is no get call to
# "/projects/response-project/jobs/response-job-id", because fetching job
# metadata isn't necessary in this case. The job already completed in
# jobs.query and we don't need the full job metadata in query_and_wait.

# Fetch the remaining two pages.
jobs_get_query_results_path = "/projects/response-project/queries/response-job-id"
Expand Down Expand Up @@ -944,6 +936,7 @@ def test_query_and_wait_incomplete_query():
Client._list_rows_from_query_results, client
)
client._call_api.side_effect = (
# jobs.query
{
"jobReference": {
"projectId": "response-project",
Expand All @@ -952,6 +945,16 @@ def test_query_and_wait_incomplete_query():
},
"jobComplete": False,
},
# jobs.get
{
"jobReference": {
"projectId": "response-project",
"jobId": "response-job-id",
"location": "response-location",
},
"status": {"state": "RUNNING"},
},
# jobs.getQueryResults with max_results=0
{
"jobReference": {
"projectId": "response-project",
Expand All @@ -968,13 +971,18 @@ def test_query_and_wait_incomplete_query():
],
},
},
# jobs.get
{
"jobReference": {
"projectId": "response-project",
"jobId": "response-job-id",
"location": "response-location",
},
"status": {"state": "DONE"},
},
# jobs.getQueryResults
# Note: No more jobs.getQueryResults with max_results=0 because the
# previous call to jobs.getQueryResults returned with jobComplete=True.
{
"rows": [
{"f": [{"v": "Whillma Phlyntstone"}, {"v": "27"}]},
Expand All @@ -987,6 +995,7 @@ def test_query_and_wait_incomplete_query():
"totalRows": 2,
"pageToken": "page-2",
},
# jobs.getQueryResults
{
"rows": [
{"f": [{"v": "Pearl Slaghoople"}, {"v": "53"}]},
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_job_retry.py
Expand Up @@ -187,8 +187,8 @@ def api_request(method, path, query_params=None, data=None, **kw):
with pytest.raises(google.api_core.exceptions.RetryError):
job.result()

# We never got a successful job, so the job id never changed:
assert job.job_id == orig_job_id
# We retried the job at least once, so we should have generated a new job ID.
assert job.job_id != orig_job_id

# We failed because we couldn't succeed after 120 seconds.
# But we can try again:
Expand Down