gh-132657: Avoid locks and refcounts in frozenset operations #136107

Open · wants to merge 2 commits into main

Conversation

@eendebakpt (Contributor) commented Jun 29, 2025

See the corresponding issue for the rationale.

The PR improves the performance of frozenset operations under the free-threaded (FT) build. A benchmark:

Script
import time


s = {1, 2, 3}
fs = frozenset(s)

# accumulated timings: set `in`, set dunder, frozenset `in`, frozenset dunder
dt = 0
dtd = 0
dtf = 0
dtdf = 0

niter = 30_000
nloop = 120


def g():
    t0 = time.perf_counter()
    z = 2
    for _ in range(niter):
        z in s
    return time.perf_counter() - t0


def gfrozen():
    t0 = time.perf_counter()
    z = 2
    for _ in range(niter):
        z in fs
    return time.perf_counter() - t0


def g_dunder():
    t0 = time.perf_counter()
    z = 2
    contains_dunder = s.__contains__
    for _ in range(niter):
        contains_dunder(z)
    return time.perf_counter() - t0


def gfrozen_dunder():
    t0 = time.perf_counter()
    z = 2
    contains_dunder = fs.__contains__
    for _ in range(niter):
        contains_dunder(z)
    return time.perf_counter() - t0


# interleaved benchmark of the methods
for ii in range(nloop):
    dt += g()
    dtf += gfrozen()
    dtd += g_dunder()
    dtdf += gfrozen_dunder()


print(f"set dt {dt * 1e3:.2f} [ms]")
print(f"frozenset dt {dtf * 1e3:.2f} [ms]")

print(f"set dunder dt {dtd * 1e3:.2f} [ms]")
print(f"frozenset dunder dt {dtdf * 1e3:.2f} [ms]")

Results on main (interleaved benchmark)

set dt 170.07 [ms]
frozenset dt 170.60 [ms]
set dunder dt 210.17 [ms]
frozenset dunder dt 182.84 [ms]

Results on PR (interleaved benchmark)

set dt 177.96 [ms]
frozenset dt 145.31 [ms]
set dunder dt 220.28 [ms]
frozenset dunder dt 185.37 [ms]

The "frozenset" benchmark improves because z in fs triggers the path taken by the adaptive interpreter.
The two dunder benchmarks call __contains__ directly (s.__contains__(z) and fs.__contains__(z)).
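
The difference between the two paths is visible in the bytecode (an illustrative snippet, not part of the PR; names are not resolved at compile time, so no real objects are needed):

import dis

# `in` compiles to the CONTAINS_OP instruction, which the adaptive
# interpreter can specialize; the bound-method call compiles to a
# generic attribute load plus a call.
dis.dis(compile("z in fs", "<demo>", "eval"))
dis.dis(compile("fs.__contains__(z)", "<demo>", "eval"))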

The FT scaling performance does not improve much. This is probably due to reference-count contention on the shared global objects (the set and the frozenset). Because the operation itself becomes faster, the contention accounts for a larger fraction of the total time, so the scaling numbers actually look worse.
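
One way to probe that hypothesis (a rough sketch, not part of the PR; the thread count, iteration count, and the helper names worker/run are made up) is to compare lookups on one shared frozenset against per-thread copies under the FT build:

import threading
import time

NTHREADS = 4
NITER = 200_000

shared = frozenset({1, 2, 3, 4})


def worker(fs, out, idx):
    z = 1
    t0 = time.perf_counter()
    for _ in range(NITER):
        z in fs
    out[idx] = time.perf_counter() - t0


def run(make_fs):
    out = [0.0] * NTHREADS
    ts = [threading.Thread(target=worker, args=(make_fs(), out, i)) for i in range(NTHREADS)]
    for t in ts:
        t.start()
    for t in ts:
        t.join()
    return max(out)


# frozenset(list(...)) forces a distinct object per thread; frozenset(shared)
# would hand back the very same object.
print(f"shared     {run(lambda: shared) * 1e3:.1f} ms")
print(f"per-thread {run(lambda: frozenset(list(shared))) * 1e3:.1f} ms")

If reference-count contention on the shared object dominates, the per-thread variant should scale visibly better.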

Script adapted from ftscalingbench.py
import copy
import os
import queue
import sys
import threading
import time

WORK_SCALE = 100

ALL_BENCHMARKS = {}

threads = []
in_queues = []
out_queues = []


def register_benchmark(func):
    ALL_BENCHMARKS[func.__name__] = func
    return func


small_tuple = (1, 2, 3, 4)
small_list = [1, 2, 3, 4]
small_set = {1, 2, 3, 4}
small_frozenset = frozenset({1, 2, 3, 4})


@register_benchmark
def tuple_contains():
    z = 0
    for i in range(500 * WORK_SCALE):
        z in small_tuple


@register_benchmark
def list_contains():
    z = 0
    for i in range(500 * WORK_SCALE):
        z in small_list


@register_benchmark
def frozenset_contains():
    z = 1
    for i in range(500 * WORK_SCALE):
        z in small_frozenset


@register_benchmark
def frozenset_contains_dunder():
    z = 1
    w = small_frozenset.__contains__
    for i in range(500 * WORK_SCALE):
        w(z)


@register_benchmark
def set_contains():
    z = 1
    w = small_set.__contains__
    for i in range(500 * WORK_SCALE):
        w(z)


@register_benchmark
def set_contains_alt():
    z = 0
    for i in range(500 * WORK_SCALE):
        z in tuple(small_set)


@register_benchmark
def shallow_copy():
    x = [1, 2, 3]
    shallow_copy = copy.copy
    for i in range(400 * WORK_SCALE):
        shallow_copy(x)


@register_benchmark
def deepcopy():
    x = {"list": [1, 2], "tuple": (1, None)}
    deepcopy = copy.deepcopy
    for i in range(80 * WORK_SCALE):
        deepcopy(x)


module = sys.modules[__name__]

thread_local = threading.local()


def bench_one_thread(func):
    t0 = time.perf_counter_ns()
    func()
    t1 = time.perf_counter_ns()
    return t1 - t0


def bench_parallel(func):
    t0 = time.perf_counter_ns()
    for inq in in_queues:
        inq.put(func)
    for outq in out_queues:
        outq.get()
    t1 = time.perf_counter_ns()
    return t1 - t0


def benchmark(func):
    delta_one_thread = bench_one_thread(func)
    delta_many_threads = bench_parallel(func)

    speedup = delta_one_thread * len(threads) / delta_many_threads
    if speedup >= 1:
        factor = speedup
        direction = "faster"
    else:
        factor = 1 / speedup
        direction = "slower"

    use_color = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    color = reset_color = ""
    if use_color:
        if speedup <= 1.1:
            color = "\x1b[31m"  # red
        elif speedup < len(threads) / 2:
            color = "\x1b[33m"  # yellow
        reset_color = "\x1b[0m"

    print(f"{color}{func.__name__:<25} {round(factor, 1):>4}x {direction}{reset_color}")


def determine_num_threads_and_affinity():
    if sys.platform != "linux":
        return [None] * os.cpu_count()

    # Try to use `lscpu -p` on Linux
    import subprocess

    try:
        output = subprocess.check_output(["lscpu", "-p=cpu,node,core,MAXMHZ"], text=True, env={"LC_NUMERIC": "C"})
    except (FileNotFoundError, subprocess.CalledProcessError):
        return [None] * os.cpu_count()

    table = []
    for line in output.splitlines():
        if line.startswith("#"):
            continue
        cpu, node, core, maxhz = line.split(",")
        if maxhz == "":
            maxhz = "0"
        table.append((int(cpu), int(node), int(core), float(maxhz)))

    cpus = []
    cores = set()
    max_mhz_all = max(row[3] for row in table)
    for cpu, node, core, maxmhz in table:
        # Choose only CPUs on the same node, unique cores, and try to avoid
        # "efficiency" cores.
        if node == 0 and core not in cores and maxmhz == max_mhz_all:
            cpus.append(cpu)
            cores.add(core)
    return cpus


def thread_run(cpu, in_queue, out_queue):
    if cpu is not None and hasattr(os, "sched_setaffinity"):
        # Set the affinity for the current thread
        os.sched_setaffinity(0, (cpu,))

    while True:
        func = in_queue.get()
        if func is None:
            break
        func()
        out_queue.put(None)


def initialize_threads(opts):
    if opts.threads == -1:
        cpus = determine_num_threads_and_affinity()
    else:
        cpus = [None] * opts.threads  # don't set affinity

    print(f"Running benchmarks with {len(cpus)} threads")
    for cpu in cpus:
        inq = queue.Queue()
        outq = queue.Queue()
        in_queues.append(inq)
        out_queues.append(outq)
        t = threading.Thread(target=thread_run, args=(cpu, inq, outq), daemon=True)
        threads.append(t)
        t.start()


def main(opts):
    global WORK_SCALE
    if not hasattr(sys, "_is_gil_enabled") or sys._is_gil_enabled():
        sys.stderr.write("expected to be run with the GIL disabled\n")

    benchmark_names = opts.benchmarks
    if benchmark_names:
        for name in benchmark_names:
            if name not in ALL_BENCHMARKS:
                sys.stderr.write(f"Unknown benchmark: {name}\n")
                sys.exit(1)
    else:
        benchmark_names = ALL_BENCHMARKS.keys()

    WORK_SCALE = opts.scale

    if not opts.baseline_only:
        initialize_threads(opts)

    do_bench = not opts.baseline_only and not opts.parallel_only
    for name in benchmark_names:
        func = ALL_BENCHMARKS[name]
        if do_bench:
            benchmark(func)
            continue

        if opts.parallel_only:
            delta_ns = bench_parallel(func)
        else:
            delta_ns = bench_one_thread(func)

        time_ms = delta_ns / 1_000_000
        print(f"{func.__name__:<18} {time_ms:.1f} ms")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--threads", type=int, default=-1, help="number of threads to use")
    parser.add_argument("--scale", type=int, default=100, help="work scale factor for the benchmark (default=100)")
    parser.add_argument(
        "--baseline-only", default=False, action="store_true", help="only run the baseline benchmarks (single thread)"
    )
    parser.add_argument(
        "--parallel-only", default=False, action="store_true", help="only run the parallel benchmark (many threads)"
    )
    parser.add_argument("benchmarks", nargs="*", help="benchmarks to run")
    options = parser.parse_args()
    main(options)

Results on main

Running benchmarks with 4 threads
tuple_contains             3.8x faster
list_contains              2.1x slower
frozenset_contains         2.8x slower
frozenset_contains_dunder  4.0x faster
set_contains               1.1x slower
set_contains_alt           1.3x slower
shallow_copy               1.0x slower
deepcopy                   1.8x faster

Results on PR

Running benchmarks with 4 threads
tuple_contains             3.8x faster
list_contains              2.1x slower
frozenset_contains         4.4x slower
frozenset_contains_dunder  3.8x faster
set_contains               1.3x slower
set_contains_alt           1.3x slower
shallow_copy               1.1x faster
deepcopy                   1.6x faster

Review thread on these lines of the set_lookkey diff:

                return NULL;
            if (table != so->table || entry->key != startkey)
                return set_lookkey(so, key, hash);
            if (frozenset) {
@eendebakpt (Contributor, Author) commented Jun 29, 2025

Note: the special case for exact unicode keys a couple of lines above was introduced to avoid the check for mutated tables. See commit eendebakpt@93035c4, which moved the code inline.

With the check for an exact frozenset we avoid the same checks, since a frozenset's table cannot change while the comparison runs.
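
As a pure-Python toy of the lookup loop (an illustration only, not the C implementation; ToySet and its version counter are invented stand-ins for the table != so->table and entry->key != startkey re-check):

class ToySet:
    # `entries` stands in for the probe sequence and `version` for the
    # C-level table pointer that changes when the table is rebuilt.
    def __init__(self, items, frozen=False):
        self.frozen = frozen
        self.entries = list(items)
        self.version = 0  # bumped whenever the table would be rebuilt

    def lookkey(self, key):
        version = self.version
        for startkey in self.entries:
            if startkey is key:
                return startkey
            if self.frozen:
                # Fast path: a frozen table cannot change during __eq__,
                # so no validity re-check (and no temporary reference to
                # startkey) is needed after the comparison.
                if startkey == key:
                    return startkey
                continue
            cmp = startkey == key         # may run arbitrary __eq__ ...
            if self.version != version:   # ... which may have mutated the set:
                return self.lookkey(key)  # restart the lookup
            if cmp:
                return startkey
        return None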

@rhettinger removed their request for review June 30, 2025 06:07
@ZeroIntensity added the performance (Performance or resource usage) label Jul 1, 2025
Labels: awaiting review, performance (Performance or resource usage)