Fix: Pagination by bounds (#2654)

* fix: pagination missing rows

* fix: separate functions

* fix: return both bounds from query

* fix: wiring

* fix: func

* fix: order col

* fix: bug

* fix: math is hard

* fix: more math

* fix: math and math and math

* fix: slightly more math

* fix: placeholders 🤦

* fix: where clause

* fix: math!

* fix: schema

* refactor: try with `CEIL`

* fix: mathin up a storm

* fix: I was actually a math major in college, who knew

* fix: copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix: copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
matt
2025-12-15 13:07:51 -05:00
committed by GitHub
parent 07af840bfe
commit 23db2a4fac
9 changed files with 658 additions and 117 deletions
+91 -5
View File
@@ -1878,11 +1878,14 @@ $$;
CREATE OR REPLACE FUNCTION list_paginated_payloads_for_offload(
partition_date date,
limit_param int,
last_tenant_id uuid,
last_inserted_at timestamptz,
last_id bigint,
last_type v1_payload_type
last_type v1_payload_type,
next_tenant_id uuid,
next_inserted_at timestamptz,
next_id bigint,
next_type v1_payload_type
) RETURNS TABLE (
tenant_id UUID,
id BIGINT,
@@ -1916,12 +1919,95 @@ BEGIN
SELECT tenant_id, id, inserted_at, external_id, type, location,
external_location_key, inline_content, updated_at
FROM %I
WHERE (tenant_id, inserted_at, id, type) >= ($1, $2, $3, $4)
WHERE
(tenant_id, inserted_at, id, type) >= ($1, $2, $3, $4)
AND (tenant_id, inserted_at, id, type) <= ($5, $6, $7, $8)
ORDER BY tenant_id, inserted_at, id, type
LIMIT $5
', source_partition_name);
RETURN QUERY EXECUTE query USING last_tenant_id, last_inserted_at, last_id, last_type, limit_param;
RETURN QUERY EXECUTE query USING last_tenant_id, last_inserted_at, last_id, last_type, next_tenant_id, next_inserted_at, next_id, next_type;
END;
$$;
-- Splits the next window of rows in the `v1_payload_YYYYMMDD` partition for `partition_date`
-- into contiguous key ranges ("chunks") and returns the first and last key of each chunk.
--
-- Parameters:
--   partition_date   date used to build the partition table name; must not be NULL.
--   window_size      maximum number of rows (in key order) considered by this call.
--   chunk_size       number of rows per chunk; must be a positive integer.
--   last_*           exclusive cursor: only rows whose (tenant_id, inserted_at, id, type)
--                    key is strictly greater than this tuple are considered.
--
-- Returns one row per chunk, ordered by the lower bound. Both bounds are inclusive.
-- The final chunk may hold fewer than `chunk_size` rows, in which case its upper bound is
-- the last row of the window.
--
-- Raises an exception when `partition_date` is NULL, when `chunk_size` is NULL or < 1,
-- or when the partition table is not listed in `pg_tables`.
CREATE OR REPLACE FUNCTION create_payload_offload_range_chunks(
    partition_date date,
    window_size int,
    chunk_size int,
    last_tenant_id uuid,
    last_inserted_at timestamptz,
    last_id bigint,
    last_type v1_payload_type
) RETURNS TABLE (
    lower_tenant_id UUID,
    lower_id BIGINT,
    lower_inserted_at TIMESTAMPTZ,
    lower_type v1_payload_type,
    upper_tenant_id UUID,
    upper_id BIGINT,
    upper_inserted_at TIMESTAMPTZ,
    upper_type v1_payload_type
)
LANGUAGE plpgsql AS
$$
DECLARE
    source_partition_name varchar;
    query text;
BEGIN
    IF partition_date IS NULL THEN
        RAISE EXCEPTION 'partition_date parameter cannot be NULL';
    END IF;

    -- A NULL chunk size would silently yield no chunks and zero would fail with a
    -- division-by-zero error deep inside the dynamic query, so reject both up front.
    IF chunk_size IS NULL OR chunk_size < 1 THEN
        RAISE EXCEPTION 'chunk_size parameter must be a positive integer, got %', chunk_size;
    END IF;

    source_partition_name := format('v1_payload_%s', to_char(partition_date, 'YYYYMMDD'));

    IF NOT EXISTS (SELECT 1 FROM pg_tables WHERE tablename = source_partition_name) THEN
        RAISE EXCEPTION 'Partition % does not exist', source_partition_name;
    END IF;

    -- Chunking math: with `rn` the 1-based row number inside the window, the zero-based
    -- chunk index of every row is `(rn - 1) / chunk_size` using integer division. The first
    -- row of a chunk satisfies `MOD(rn - 1, chunk_size) = 0` and the last row satisfies
    -- `MOD(rn, chunk_size) = 0`. Using the same integer expression on both sides keeps the
    -- join on matching integer types and also works for a chunk size of one, where
    -- `MOD(rn, 1)` can never equal 1.
    -- NOTE: the text below goes through format(), so it must not contain a percent sign
    -- other than the single table-name placeholder.
    query := format('
        WITH paginated AS (
            SELECT
                tenant_id,
                id,
                inserted_at,
                type,
                ROW_NUMBER() OVER (ORDER BY tenant_id, inserted_at, id, type) AS rn
            FROM %I
            WHERE (tenant_id, inserted_at, id, type) > ($1, $2, $3, $4)
            ORDER BY tenant_id, inserted_at, id, type
            LIMIT $5::INTEGER
        ), lower_bounds AS (
            -- First row of every chunk.
            SELECT
                (rn - 1) / $6::INTEGER AS batch_ix,
                tenant_id::UUID,
                id::BIGINT,
                inserted_at::TIMESTAMPTZ,
                type::v1_payload_type
            FROM paginated
            WHERE MOD(rn - 1, $6::INTEGER) = 0
        ), upper_bounds AS (
            -- Last row of every full chunk, plus the last row of the whole window, which
            -- closes a trailing partial chunk when the window is not evenly divisible.
            SELECT
                (rn - 1) / $6::INTEGER AS batch_ix,
                tenant_id::UUID,
                id::BIGINT,
                inserted_at::TIMESTAMPTZ,
                type::v1_payload_type
            FROM paginated
            WHERE MOD(rn, $6::INTEGER) = 0 OR rn = (SELECT MAX(rn) FROM paginated)
        )
        SELECT
            lb.tenant_id AS lower_tenant_id,
            lb.id AS lower_id,
            lb.inserted_at AS lower_inserted_at,
            lb.type AS lower_type,
            ub.tenant_id AS upper_tenant_id,
            ub.id AS upper_id,
            ub.inserted_at AS upper_inserted_at,
            ub.type AS upper_type
        FROM lower_bounds lb
        JOIN upper_bounds ub ON lb.batch_ix = ub.batch_ix
        ORDER BY lb.tenant_id, lb.inserted_at, lb.id, lb.type
    ', source_partition_name);

    RETURN QUERY EXECUTE query
        USING last_tenant_id, last_inserted_at, last_id, last_type, window_size, chunk_size;
END;
$$;
+84 -5
View File
@@ -920,10 +920,12 @@ $$;
CREATE OR REPLACE FUNCTION list_paginated_olap_payloads_for_offload(
partition_date date,
limit_param int,
last_tenant_id uuid,
last_external_id uuid,
last_inserted_at timestamptz
last_inserted_at timestamptz,
next_tenant_id uuid,
next_external_id uuid,
next_inserted_at timestamptz
) RETURNS TABLE (
tenant_id UUID,
external_id UUID,
@@ -954,12 +956,89 @@ BEGIN
query := format('
SELECT tenant_id, external_id, location, external_location_key, inline_content, inserted_at, updated_at
FROM %I
WHERE (tenant_id, external_id, inserted_at) >= ($1, $2, $3)
WHERE
(tenant_id, external_id, inserted_at) >= ($1, $2, $3)
AND (tenant_id, external_id, inserted_at) <= ($4, $5, $6)
ORDER BY tenant_id, external_id, inserted_at
LIMIT $4
', source_partition_name);
RETURN QUERY EXECUTE query USING last_tenant_id, last_external_id, last_inserted_at, limit_param;
RETURN QUERY EXECUTE query USING last_tenant_id, last_external_id, last_inserted_at, next_tenant_id, next_external_id, next_inserted_at;
END;
$$;
-- Splits the next window of rows in the `v1_payloads_olap_YYYYMMDD` partition for
-- `partition_date` into contiguous key ranges ("chunks") and returns the first and last key
-- of each chunk.
--
-- Parameters:
--   partition_date   date used to build the partition table name; must not be NULL.
--   window_size      maximum number of rows (in key order) considered by this call.
--   chunk_size       number of rows per chunk; must be a positive integer.
--   last_*           exclusive cursor: only rows whose (tenant_id, external_id, inserted_at)
--                    key is strictly greater than this tuple are considered.
--
-- Returns one row per chunk, ordered by the lower bound. Both bounds are inclusive.
-- The final chunk may hold fewer than `chunk_size` rows, in which case its upper bound is
-- the last row of the window.
--
-- Raises an exception when `partition_date` is NULL, when `chunk_size` is NULL or < 1,
-- or when the partition table is not listed in `pg_tables`.
CREATE OR REPLACE FUNCTION create_olap_payload_offload_range_chunks(
    partition_date date,
    window_size int,
    chunk_size int,
    last_tenant_id uuid,
    last_external_id uuid,
    last_inserted_at timestamptz
) RETURNS TABLE (
    lower_tenant_id UUID,
    lower_external_id UUID,
    lower_inserted_at TIMESTAMPTZ,
    upper_tenant_id UUID,
    upper_external_id UUID,
    upper_inserted_at TIMESTAMPTZ
)
LANGUAGE plpgsql AS
$$
DECLARE
    source_partition_name varchar;
    query text;
BEGIN
    IF partition_date IS NULL THEN
        RAISE EXCEPTION 'partition_date parameter cannot be NULL';
    END IF;

    -- A NULL chunk size would silently yield no chunks and zero would fail with a
    -- division-by-zero error deep inside the dynamic query, so reject both up front.
    IF chunk_size IS NULL OR chunk_size < 1 THEN
        RAISE EXCEPTION 'chunk_size parameter must be a positive integer, got %', chunk_size;
    END IF;

    source_partition_name := format('v1_payloads_olap_%s', to_char(partition_date, 'YYYYMMDD'));

    IF NOT EXISTS (SELECT 1 FROM pg_tables WHERE tablename = source_partition_name) THEN
        RAISE EXCEPTION 'Partition % does not exist', source_partition_name;
    END IF;

    -- Chunking math: with `rn` the 1-based row number inside the window, the zero-based
    -- chunk index of every row is `(rn - 1) / chunk_size` using integer division. The first
    -- row of a chunk satisfies `MOD(rn - 1, chunk_size) = 0` and the last row satisfies
    -- `MOD(rn, chunk_size) = 0`. Using the same integer expression on both sides keeps the
    -- join on matching integer types and also works for a chunk size of one, where
    -- `MOD(rn, 1)` can never equal 1.
    -- NOTE: the text below goes through format(), so it must not contain a percent sign
    -- other than the single table-name placeholder.
    query := format('
        WITH paginated AS (
            SELECT
                tenant_id,
                external_id,
                inserted_at,
                ROW_NUMBER() OVER (ORDER BY tenant_id, external_id, inserted_at) AS rn
            FROM %I
            WHERE (tenant_id, external_id, inserted_at) > ($1, $2, $3)
            ORDER BY tenant_id, external_id, inserted_at
            LIMIT $4
        ), lower_bounds AS (
            -- First row of every chunk.
            SELECT
                (rn - 1) / $5::INTEGER AS batch_ix,
                tenant_id::UUID,
                external_id::UUID,
                inserted_at::TIMESTAMPTZ
            FROM paginated
            WHERE MOD(rn - 1, $5::INTEGER) = 0
        ), upper_bounds AS (
            -- Last row of every full chunk, plus the last row of the whole window, which
            -- closes a trailing partial chunk when the window is not evenly divisible.
            SELECT
                (rn - 1) / $5::INTEGER AS batch_ix,
                tenant_id::UUID,
                external_id::UUID,
                inserted_at::TIMESTAMPTZ
            FROM paginated
            WHERE MOD(rn, $5::INTEGER) = 0 OR rn = (SELECT MAX(rn) FROM paginated)
        )
        SELECT
            lb.tenant_id AS lower_tenant_id,
            lb.external_id AS lower_external_id,
            lb.inserted_at AS lower_inserted_at,
            ub.tenant_id AS upper_tenant_id,
            ub.external_id AS upper_external_id,
            ub.inserted_at AS upper_inserted_at
        FROM lower_bounds lb
        JOIN upper_bounds ub ON lb.batch_ix = ub.batch_ix
        ORDER BY lb.tenant_id, lb.external_id, lb.inserted_at
    ', source_partition_name);

    RETURN QUERY EXECUTE query
        USING last_tenant_id, last_external_id, last_inserted_at, window_size, chunk_size;
END;
$$;