# Copyright Materialize, Inc. and contributors. All rights reserved. # # Use of this software is governed by the Business Source License # included in the LICENSE file at the root of this repository. # # As of the Change Date specified in that file, in accordance with # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. # Tests the new syntax for GROUP SIZE query hints proposed in: # https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/20230829_topk_size_hint.md # Additionally, the tests below include scenarios that validate backwards compability in the # hint syntax according to what is described in the design doc. statement ok CREATE TABLE teachers (id INT, name TEXT); statement ok CREATE TABLE sections (id INT, teacher_id INT, course_id INT, schedule TEXT); # Illustrates that the old hint applies to multiple operators in a single query block. statement ok CREATE MATERIALIZED VIEW distinct_on_group_by_limit AS SELECT DISTINCT ON(teacher_id) id, teacher_id, MAX(course_id) FROM sections GROUP BY id, teacher_id OPTIONS (EXPECTED GROUP SIZE = 1000) ORDER BY teacher_id, id LIMIT 2; query T multiline EXPLAIN OPTIMIZED PLAN WITH (humanized expressions) AS VERBOSE TEXT FOR MATERIALIZED VIEW distinct_on_group_by_limit; ---- materialize.public.distinct_on_group_by_limit: TopK order_by=[#1{teacher_id} asc nulls_last, #0{id} asc nulls_last] limit=2 exp_group_size=1000 TopK group_by=[#1{teacher_id}] order_by=[#0{id} asc nulls_last] limit=1 exp_group_size=1000 Reduce group_by=[#0{id}, #1{teacher_id}] aggregates=[max(#2{course_id})] exp_group_size=1000 Project (#0{id}..=#2{course_id}) ReadStorage materialize.public.sections Source materialize.public.sections Target cluster: quickstart EOF statement ok DROP MATERIALIZED VIEW distinct_on_group_by_limit; # Illustrates a workaround with the old hint to apply different values to different operators in # a single query block. statement ok CREATE MATERIALIZED VIEW distinct_on_group_by_limit AS SELECT id, teacher_id, max_course_id FROM ( SELECT DISTINCT ON(teacher_id) id, teacher_id, max_course_id FROM ( SELECT id, teacher_id, MAX(course_id) AS max_course_id FROM sections GROUP BY id, teacher_id OPTIONS (EXPECTED GROUP SIZE = 1000) ) OPTIONS (EXPECTED GROUP SIZE = 60) ORDER BY teacher_id, id ) OPTIONS (EXPECTED GROUP SIZE = 50) ORDER BY teacher_id, id LIMIT 2; query T multiline EXPLAIN OPTIMIZED PLAN WITH (humanized expressions) AS VERBOSE TEXT FOR MATERIALIZED VIEW distinct_on_group_by_limit; ---- materialize.public.distinct_on_group_by_limit: TopK order_by=[#1{teacher_id} asc nulls_last, #0{id} asc nulls_last] limit=2 exp_group_size=50 TopK group_by=[#1{teacher_id}] order_by=[#0{id} asc nulls_last] limit=1 exp_group_size=60 Reduce group_by=[#0{id}, #1{teacher_id}] aggregates=[max(#2{course_id})] exp_group_size=1000 Project (#0{id}..=#2{course_id}) ReadStorage materialize.public.sections Source materialize.public.sections Target cluster: quickstart EOF statement ok DROP MATERIALIZED VIEW distinct_on_group_by_limit; # Illustrates that new hints apply cleanly without ambiguity to different query blocks. statement ok CREATE MATERIALIZED VIEW distinct_on_group_by_limit AS SELECT id, teacher_id, max_course_id FROM ( SELECT DISTINCT ON(teacher_id) id, teacher_id, max_course_id FROM ( SELECT id, teacher_id, MAX(course_id) AS max_course_id FROM sections GROUP BY id, teacher_id OPTIONS (AGGREGATE INPUT GROUP SIZE = 1000) ) OPTIONS (DISTINCT ON INPUT GROUP SIZE = 60) ORDER BY teacher_id, id ) OPTIONS (LIMIT INPUT GROUP SIZE = 50) ORDER BY teacher_id, id LIMIT 2; query T multiline EXPLAIN OPTIMIZED PLAN WITH (humanized expressions) AS VERBOSE TEXT FOR MATERIALIZED VIEW distinct_on_group_by_limit; ---- materialize.public.distinct_on_group_by_limit: TopK order_by=[#1{teacher_id} asc nulls_last, #0{id} asc nulls_last] limit=2 exp_group_size=50 TopK group_by=[#1{teacher_id}] order_by=[#0{id} asc nulls_last] limit=1 exp_group_size=60 Reduce group_by=[#0{id}, #1{teacher_id}] aggregates=[max(#2{course_id})] exp_group_size=1000 Project (#0{id}..=#2{course_id}) ReadStorage materialize.public.sections Source materialize.public.sections Target cluster: quickstart EOF statement ok DROP MATERIALIZED VIEW distinct_on_group_by_limit; # Illustrates that new hints apply without ambiguity in a single query block. statement ok CREATE MATERIALIZED VIEW distinct_on_group_by_limit AS SELECT DISTINCT ON(teacher_id) id, teacher_id, MAX(course_id) FROM sections GROUP BY id, teacher_id OPTIONS ( AGGREGATE INPUT GROUP SIZE = 1000, DISTINCT ON INPUT GROUP SIZE = 60, LIMIT INPUT GROUP SIZE = 50) ORDER BY teacher_id, id LIMIT 2; query T multiline EXPLAIN OPTIMIZED PLAN WITH (humanized expressions) AS VERBOSE TEXT FOR MATERIALIZED VIEW distinct_on_group_by_limit; ---- materialize.public.distinct_on_group_by_limit: TopK order_by=[#1{teacher_id} asc nulls_last, #0{id} asc nulls_last] limit=2 exp_group_size=50 TopK group_by=[#1{teacher_id}] order_by=[#0{id} asc nulls_last] limit=1 exp_group_size=60 Reduce group_by=[#0{id}, #1{teacher_id}] aggregates=[max(#2{course_id})] exp_group_size=1000 Project (#0{id}..=#2{course_id}) ReadStorage materialize.public.sections Source materialize.public.sections Target cluster: quickstart EOF statement ok DROP MATERIALIZED VIEW distinct_on_group_by_limit; # Illustrates that partial combinations of the new hints in a single query block, # namely AGGREGATE and LIMIT INPUT GROUP SIZE. statement ok CREATE MATERIALIZED VIEW distinct_on_group_by_limit AS SELECT DISTINCT ON(teacher_id) id, teacher_id, MAX(course_id) FROM sections GROUP BY id, teacher_id OPTIONS ( AGGREGATE INPUT GROUP SIZE = 1000, LIMIT INPUT GROUP SIZE = 50) ORDER BY teacher_id, id LIMIT 2; query T multiline EXPLAIN OPTIMIZED PLAN WITH (humanized expressions) AS VERBOSE TEXT FOR MATERIALIZED VIEW distinct_on_group_by_limit; ---- materialize.public.distinct_on_group_by_limit: TopK order_by=[#1{teacher_id} asc nulls_last, #0{id} asc nulls_last] limit=2 exp_group_size=50 TopK group_by=[#1{teacher_id}] order_by=[#0{id} asc nulls_last] limit=1 Reduce group_by=[#0{id}, #1{teacher_id}] aggregates=[max(#2{course_id})] exp_group_size=1000 Project (#0{id}..=#2{course_id}) ReadStorage materialize.public.sections Source materialize.public.sections Target cluster: quickstart EOF statement ok DROP MATERIALIZED VIEW distinct_on_group_by_limit; # Illustrates that partial combinations of the new hints in a single query block, # namely LIMIT and DISTINCT ON INPUT GROUP SIZE. statement ok CREATE MATERIALIZED VIEW distinct_on_group_by_limit AS SELECT DISTINCT ON(teacher_id) id, teacher_id, MAX(course_id) FROM sections GROUP BY id, teacher_id OPTIONS ( LIMIT INPUT GROUP SIZE = 50, DISTINCT ON INPUT GROUP SIZE = 60) ORDER BY teacher_id, id LIMIT 2; query T multiline EXPLAIN OPTIMIZED PLAN WITH (humanized expressions) AS VERBOSE TEXT FOR MATERIALIZED VIEW distinct_on_group_by_limit; ---- materialize.public.distinct_on_group_by_limit: TopK order_by=[#1{teacher_id} asc nulls_last, #0{id} asc nulls_last] limit=2 exp_group_size=50 TopK group_by=[#1{teacher_id}] order_by=[#0{id} asc nulls_last] limit=1 exp_group_size=60 Reduce group_by=[#0{id}, #1{teacher_id}] aggregates=[max(#2{course_id})] Project (#0{id}..=#2{course_id}) ReadStorage materialize.public.sections Source materialize.public.sections Target cluster: quickstart EOF statement ok DROP MATERIALIZED VIEW distinct_on_group_by_limit; # Illustrates that mixing of the old and new syntax for hints raises an error. statement error EXPECTED GROUP SIZE cannot be provided simultaneously with any of AGGREGATE INPUT GROUP SIZE, DISTINCT ON INPUT GROUP SIZE, or LIMIT INPUT GROUP SIZE CREATE MATERIALIZED VIEW distinct_on_group_by_limit AS SELECT DISTINCT ON(teacher_id) id, teacher_id, MAX(course_id) FROM sections GROUP BY id, teacher_id OPTIONS ( LIMIT INPUT GROUP SIZE = 50, EXPECTED GROUP SIZE = 1000) ORDER BY teacher_id, id LIMIT 2; # Illustrates that the new syntax for hints can be used with a LATERAL top-k pattern. statement ok CREATE MATERIALIZED VIEW sections_of_top_3_courses_per_teacher AS SELECT id AS teacher_id, section_id FROM teachers grp, LATERAL (SELECT id AS section_id FROM sections WHERE teacher_id = grp.id OPTIONS (LIMIT INPUT GROUP SIZE = 1000) ORDER BY course_id DESC LIMIT 3); query T multiline EXPLAIN OPTIMIZED PLAN WITH (humanized expressions) AS VERBOSE TEXT FOR MATERIALIZED VIEW sections_of_top_3_courses_per_teacher; ---- materialize.public.sections_of_top_3_courses_per_teacher: Project (#0{id}, #2{id}) Join on=(#0{id} = #1{id}) type=differential ArrangeBy keys=[[#0{id}]] Project (#0{id}) ReadStorage materialize.public.teachers ArrangeBy keys=[[#0{id}]] Project (#0{id}, #1{id}) TopK group_by=[#0{id}] order_by=[#2{course_id} desc nulls_first] limit=3 exp_group_size=1000 Project (#0{id}, #1{id}, #3{course_id}) Join on=(#0{id} = #2{teacher_id}) type=differential ArrangeBy keys=[[#0{id}]] Distinct project=[#0{id}] Project (#0{id}) Filter (#0{id}) IS NOT NULL ReadStorage materialize.public.teachers ArrangeBy keys=[[#1{teacher_id}]] Project (#0{id}..=#2{course_id}) Filter (#1{teacher_id}) IS NOT NULL ReadStorage materialize.public.sections Source materialize.public.teachers Source materialize.public.sections filter=((#1{teacher_id}) IS NOT NULL) Target cluster: quickstart EOF statement ok DROP MATERIALIZED VIEW sections_of_top_3_courses_per_teacher; # Illustrates that the new syntax for hints can be used with a LATERAL top-k pattern # and in conjunction with a min/max aggregation in the same query block. statement ok CREATE MATERIALIZED VIEW max_sections_of_top_3_courses_per_teacher AS SELECT id AS teacher_id, max_section_id FROM teachers grp, LATERAL (SELECT course_id, MAX(id) AS max_section_id FROM sections WHERE teacher_id = grp.id GROUP BY course_id OPTIONS (AGGREGATE INPUT GROUP SIZE = 1000, LIMIT INPUT GROUP SIZE = 20) ORDER BY course_id DESC LIMIT 3); query T multiline EXPLAIN OPTIMIZED PLAN WITH (humanized expressions) AS VERBOSE TEXT FOR MATERIALIZED VIEW max_sections_of_top_3_courses_per_teacher; ---- materialize.public.max_sections_of_top_3_courses_per_teacher: Project (#0{id}, #2{max_id}) Join on=(#0{id} = #1{id}) type=differential ArrangeBy keys=[[#0{id}]] Project (#0{id}) ReadStorage materialize.public.teachers ArrangeBy keys=[[#0{id}]] Project (#0{id}, #2{max_id}) TopK group_by=[#0{id}] order_by=[#1{course_id} desc nulls_first] limit=3 exp_group_size=20 Reduce group_by=[#0{id}, #2{course_id}] aggregates=[max(#1{id})] exp_group_size=1000 Project (#0{id}, #1{id}, #3{course_id}) Join on=(#0{id} = #2{teacher_id}) type=differential ArrangeBy keys=[[#0{id}]] Distinct project=[#0{id}] Project (#0{id}) Filter (#0{id}) IS NOT NULL ReadStorage materialize.public.teachers ArrangeBy keys=[[#1{teacher_id}]] Project (#0{id}..=#2{course_id}) Filter (#1{teacher_id}) IS NOT NULL ReadStorage materialize.public.sections Source materialize.public.teachers Source materialize.public.sections filter=((#1{teacher_id}) IS NOT NULL) Target cluster: quickstart EOF statement ok DROP MATERIALIZED VIEW max_sections_of_top_3_courses_per_teacher;