gh-132657: Avoid locks and refcounts in frozenset operations #136107

Open · wants to merge 2 commits into main

Conversation

@eendebakpt (Contributor) commented Jun 29, 2025

See the corresponding issue for the rationale.

The PR improves the performance of frozenset operations under the free-threaded (FT) build. A benchmark:

Script
import time


s = {1, 2, 3}
fs = frozenset(s)

# accumulated timings: set `in`, set dunder, frozenset `in`, frozenset dunder
dt = 0
dtd = 0
dtf = 0
dtdf = 0

niter = 30_000
nloop = 120


def g():
    t0 = time.perf_counter()
    z = 2
    for _ in range(niter):
        z in s
    return time.perf_counter() - t0


def gfrozen():
    t0 = time.perf_counter()
    z = 2
    for _ in range(niter):
        z in fs
    return time.perf_counter() - t0


def g_dunder():
    t0 = time.perf_counter()
    z = 2
    contains_dunder = s.__contains__
    for _ in range(niter):
        contains_dunder(z)
    return time.perf_counter() - t0


def gfrozen_dunder():
    t0 = time.perf_counter()
    z = 2
    contains_dunder = fs.__contains__
    for _ in range(niter):
        contains_dunder(z)
    return time.perf_counter() - t0


# interleaved benchmark of the methods
for ii in range(nloop):
    dt += g()
    dtf += gfrozen()
    dtd += g_dunder()
    dtdf += gfrozen_dunder()


print(f"set dt {dt * 1e3:.2f} [ms]")
print(f"frozenset dt {dtf * 1e3:.2f} [ms]")

print(f"set dunder dt {dtd * 1e3:.2f} [ms]")
print(f"frozenset dunder dt {dtdf * 1e3:.2f} [ms]")

Results on main (interleaved benchmark)

set dt 170.07 [ms]
frozenset dt 170.60 [ms]
set dunder dt 210.17 [ms]
frozenset dunder dt 182.84 [ms]

Results on PR (interleaved benchmark)

set dt 177.96 [ms]
frozenset dt 145.31 [ms]
set dunder dt 220.28 [ms]
frozenset dunder dt 185.37 [ms]

The "frozenset" benchmark improves because z in fs triggers the path taken by the adaptive interpreter.
The two dunder benchmarks call __contains__ directly (s.__contains__(z) and fs.__contains__(z)).
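
The difference between the two paths is visible in the bytecode (an illustrative snippet, not part of the PR; names are not resolved at compile time, so no real objects are needed):

import dis

# `in` compiles to the CONTAINS_OP instruction, which the adaptive
# interpreter can specialize; the bound-method call compiles to a
# generic attribute load plus a call.
dis.dis(compile("z in fs", "<demo>", "eval"))
dis.dis(compile("fs.__contains__(z)", "<demo>", "eval"))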

The FT scaling performance does not improve much. This is probably due to reference-count contention on the shared global objects (the set and the frozenset). Because the operation itself becomes faster, the contention accounts for a larger fraction of the total time, so the scaling numbers actually look worse.
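
One way to probe that hypothesis (a rough sketch, not part of the PR; the thread count, iteration count, and the helper names worker/run are made up) is to compare lookups on one shared frozenset against per-thread copies under the FT build:

import threading
import time

NTHREADS = 4
NITER = 200_000

shared = frozenset({1, 2, 3, 4})


def worker(fs, out, idx):
    z = 1
    t0 = time.perf_counter()
    for _ in range(NITER):
        z in fs
    out[idx] = time.perf_counter() - t0


def run(make_fs):
    out = [0.0] * NTHREADS
    ts = [threading.Thread(target=worker, args=(make_fs(), out, i)) for i in range(NTHREADS)]
    for t in ts:
        t.start()
    for t in ts:
        t.join()
    return max(out)


# frozenset(list(...)) forces a distinct object per thread; frozenset(shared)
# would hand back the very same object.
print(f"shared     {run(lambda: shared) * 1e3:.1f} ms")
print(f"per-thread {run(lambda: frozenset(list(shared))) * 1e3:.1f} ms")

If reference-count contention on the shared object dominates, the per-thread variant should scale visibly better.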

Script adapted from ftscalingbench.py
import copy
import os
import queue
import sys
import threading
import time

WORK_SCALE = 100

ALL_BENCHMARKS = {}

threads = []
in_queues = []
out_queues = []


def register_benchmark(func):
    ALL_BENCHMARKS[func.__name__] = func
    return func


small_tuple = (1, 2, 3, 4)
small_list = [1, 2, 3, 4]
small_set = {1, 2, 3, 4}
small_frozenset = frozenset({1, 2, 3, 4})


@register_benchmark
def tuple_contains():
    z = 0
    for i in range(500 * WORK_SCALE):
        z in small_tuple


@register_benchmark
def list_contains():
    z = 0
    for i in range(500 * WORK_SCALE):
        z in small_list


@register_benchmark
def frozenset_contains():
    z = 1
    for i in range(500 * WORK_SCALE):
        z in small_frozenset


@register_benchmark
def frozenset_contains_dunder():
    z = 1
    w = small_frozenset.__contains__
    for i in range(500 * WORK_SCALE):
        w(z)


@register_benchmark
def set_contains():
    z = 1
    w = small_set.__contains__
    for i in range(500 * WORK_SCALE):
        w(z)


@register_benchmark
def set_contains_alt():
    z = 0
    for i in range(500 * WORK_SCALE):
        z in tuple(small_set)


@register_benchmark
def shallow_copy():
    x = [1, 2, 3]
    shallow_copy = copy.copy
    for i in range(400 * WORK_SCALE):
        shallow_copy(x)


@register_benchmark
def deepcopy():
    x = {"list": [1, 2], "tuple": (1, None)}
    deepcopy = copy.deepcopy
    for i in range(80 * WORK_SCALE):
        deepcopy(x)


module = sys.modules[__name__]

thread_local = threading.local()


def bench_one_thread(func):
    t0 = time.perf_counter_ns()
    func()
    t1 = time.perf_counter_ns()
    return t1 - t0


def bench_parallel(func):
    t0 = time.perf_counter_ns()
    for inq in in_queues:
        inq.put(func)
    for outq in out_queues:
        outq.get()
    t1 = time.perf_counter_ns()
    return t1 - t0


def benchmark(func):
    delta_one_thread = bench_one_thread(func)
    delta_many_threads = bench_parallel(func)

    speedup = delta_one_thread * len(threads) / delta_many_threads
    if speedup >= 1:
        factor = speedup
        direction = "faster"
    else:
        factor = 1 / speedup
        direction = "slower"

    use_color = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    color = reset_color = ""
    if use_color:
        if speedup <= 1.1:
            color = "\x1b[31m"  # red
        elif speedup < len(threads) / 2:
            color = "\x1b[33m"  # yellow
        reset_color = "\x1b[0m"

    print(f"{color}{func.__name__:<25} {round(factor, 1):>4}x {direction}{reset_color}")


def determine_num_threads_and_affinity():
    if sys.platform != "linux":
        return [None] * os.cpu_count()

    # Try to use `lscpu -p` on Linux
    import subprocess

    try:
        output = subprocess.check_output(["lscpu", "-p=cpu,node,core,MAXMHZ"], text=True, env={"LC_NUMERIC": "C"})
    except (FileNotFoundError, subprocess.CalledProcessError):
        return [None] * os.cpu_count()

    table = []
    for line in output.splitlines():
        if line.startswith("#"):
            continue
        cpu, node, core, maxhz = line.split(",")
        if maxhz == "":
            maxhz = "0"
        table.append((int(cpu), int(node), int(core), float(maxhz)))

    cpus = []
    cores = set()
    max_mhz_all = max(row[3] for row in table)
    for cpu, node, core, maxmhz in table:
        # Choose only CPUs on the same node, unique cores, and try to avoid
        # "efficiency" cores.
        if node == 0 and core not in cores and maxmhz == max_mhz_all:
            cpus.append(cpu)
            cores.add(core)
    return cpus


def thread_run(cpu, in_queue, out_queue):
    if cpu is not None and hasattr(os, "sched_setaffinity"):
        # Set the affinity for the current thread
        os.sched_setaffinity(0, (cpu,))

    while True:
        func = in_queue.get()
        if func is None:
            break
        func()
        out_queue.put(None)


def initialize_threads(opts):
    if opts.threads == -1:
        cpus = determine_num_threads_and_affinity()
    else:
        cpus = [None] * opts.threads  # don't set affinity

    print(f"Running benchmarks with {len(cpus)} threads")
    for cpu in cpus:
        inq = queue.Queue()
        outq = queue.Queue()
        in_queues.append(inq)
        out_queues.append(outq)
        t = threading.Thread(target=thread_run, args=(cpu, inq, outq), daemon=True)
        threads.append(t)
        t.start()


def main(opts):
    global WORK_SCALE
    if not hasattr(sys, "_is_gil_enabled") or sys._is_gil_enabled():
        sys.stderr.write("expected to be run with the GIL disabled\n")

    benchmark_names = opts.benchmarks
    if benchmark_names:
        for name in benchmark_names:
            if name not in ALL_BENCHMARKS:
                sys.stderr.write(f"Unknown benchmark: {name}\n")
                sys.exit(1)
    else:
        benchmark_names = ALL_BENCHMARKS.keys()

    WORK_SCALE = opts.scale

    if not opts.baseline_only:
        initialize_threads(opts)

    do_bench = not opts.baseline_only and not opts.parallel_only
    for name in benchmark_names:
        func = ALL_BENCHMARKS[name]
        if do_bench:
            benchmark(func)
            continue

        if opts.parallel_only:
            delta_ns = bench_parallel(func)
        else:
            delta_ns = bench_one_thread(func)

        time_ms = delta_ns / 1_000_000
        print(f"{func.__name__:<18} {time_ms:.1f} ms")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--threads", type=int, default=-1, help="number of threads to use")
    parser.add_argument("--scale", type=int, default=100, help="work scale factor for the benchmark (default=100)")
    parser.add_argument(
        "--baseline-only", default=False, action="store_true", help="only run the baseline benchmarks (single thread)"
    )
    parser.add_argument(
        "--parallel-only", default=False, action="store_true", help="only run the parallel benchmark (many threads)"
    )
    parser.add_argument("benchmarks", nargs="*", help="benchmarks to run")
    options = parser.parse_args()
    main(options)

Results on main

Running benchmarks with 4 threads
tuple_contains             3.8x faster
list_contains              2.1x slower
frozenset_contains         2.8x slower
frozenset_contains_dunder  4.0x faster
set_contains               1.1x slower
set_contains_alt           1.3x slower
shallow_copy               1.0x slower
deepcopy                   1.8x faster

Results on PR

Running benchmarks with 4 threads
tuple_contains             3.8x faster
list_contains              2.1x slower
frozenset_contains         4.4x slower
frozenset_contains_dunder  3.8x faster
set_contains               1.3x slower
set_contains_alt           1.3x slower
shallow_copy               1.1x faster
deepcopy                   1.6x faster

Review thread on these lines of the set_lookkey diff:

                return NULL;
            if (table != so->table || entry->key != startkey)
                return set_lookkey(so, key, hash);
            if (frozenset) {
@eendebakpt (Contributor, Author) commented Jun 29, 2025

Note: the special case for exact unicode keys a couple of lines above was introduced to avoid the check for mutated tables. See commit eendebakpt@93035c4, which moved the code inline.

With the check for an exact frozenset we avoid the same checks, since a frozenset's table cannot change while the comparison runs.
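
As a pure-Python toy of the lookup loop (an illustration only, not the C implementation; ToySet and its version counter are invented stand-ins for the table != so->table and entry->key != startkey re-check):

class ToySet:
    # `entries` stands in for the probe sequence and `version` for the
    # C-level table pointer that changes when the table is rebuilt.
    def __init__(self, items, frozen=False):
        self.frozen = frozen
        self.entries = list(items)
        self.version = 0  # bumped whenever the table would be rebuilt

    def lookkey(self, key):
        version = self.version
        for startkey in self.entries:
            if startkey is key:
                return startkey
            if self.frozen:
                # Fast path: a frozen table cannot change during __eq__,
                # so no validity re-check (and no temporary reference to
                # startkey) is needed after the comparison.
                if startkey == key:
                    return startkey
                continue
            cmp = startkey == key         # may run arbitrary __eq__ ...
            if self.version != version:   # ... which may have mutated the set:
                return self.lookkey(key)  # restart the lookup
            if cmp:
                return startkey
        return None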

@rhettinger removed their request for review June 30, 2025 06:07
@ZeroIntensity added the performance (Performance or resource usage) label Jul 1, 2025
Labels: awaiting review, performance (Performance or resource usage)