# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.
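
# Test Kafka header ingestion: INCLUDE HEADERS and INCLUDE HEADER ... AS with
# ENVELOPE UPSERT, covering the full header list, individual headers, byte
# values, duplicate keys, null values, and error cases.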

$ set-arg-default default-storage-size=1
$ set-arg-default single-replica-cluster=quickstart

$ set keyschema={
    "type": "record",
    "name": "Key",
    "fields": [
        {"name": "key", "type": "string"}
    ]
  }

$ set schema={
    "type" : "record",
    "name" : "test",
    "fields" : [
        {"name":"f1", "type":"string"},
        {"name":"f2", "type":"long"}
    ]
  }

$ kafka-create-topic topic=headers_src

# [103, 117, 115, 51] = "gus3"
$ kafka-ingest format=avro topic=headers_src key-format=avro key-schema=${keyschema} schema=${schema} headers={"gus": "gusfive", "gus2": [103, 117, 115, 51]}
{"key": "fish"} {"f1": "fishval", "f2": 1000}

$ kafka-ingest format=avro topic=headers_src key-format=avro key-schema=${keyschema} schema=${schema}
{"key": "fish2"} {"f1": "fishval", "f2": 1000}
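
# A plaintext connection to the testdrive Kafka broker, shared by every source below.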
> CREATE CONNECTION kafka_conn
  TO KAFKA (BROKER '${testdrive.kafka-addr}', SECURITY PROTOCOL PLAINTEXT);

> CREATE CLUSTER headers_src_cluster SIZE '${arg.default-storage-size}';

> CREATE SOURCE headers_src
  IN CLUSTER headers_src_cluster
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-headers_src-${testdrive.seed}')

> CREATE TABLE headers_src_tbl FROM SOURCE headers_src (REFERENCE "testdrive-headers_src-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schema}'
  INCLUDE HEADERS
  ENVELOPE UPSERT
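
# INCLUDE HEADERS adds a `headers` column containing the full header list as
# (key, value) pairs; the values are raw bytes, rendered as \x-escaped hex
# when the column is cast to text below.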

# Both cases: a message with headers (fish) and one without (fish2).
> SELECT key, f1, f2, list_length(headers), headers::text from headers_src_tbl
key f1 f2 list_length headers
----------------------------------------------
fish fishval 1000 2 "{\"(gus,\\\"\\\\\\\\x67757366697665\\\")\",\"(gus2,\\\"\\\\\\\\x67757333\\\")\"}"
fish2 fishval 1000 0 "{}"

# Unpacking individual header values by index works.
> SELECT key, f1, f2, headers[1].value as gus from headers_src_tbl
key f1 f2 gus
-------------------------------------------
fish fishval 1000 gusfive
fish2 fishval 1000 <null>

# map_build lets you get the headers as a map
> SELECT key, f1, f2, map_build(headers)->'gus' as gus, map_build(headers)->'gus2' AS gus2 from headers_src_tbl;
key f1 f2 gus gus2
-------------------------------------------
fish fishval 1000 gusfive gus3
fish2 fishval 1000 <null> <null>
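
# Note: the byte-array header ingested above ([103, 117, 115, 51]) comes back
# as the text "gus3" in the map.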

# selecting by key works
> SELECT key, f1, f2, thekey, value FROM (SELECT i.key, i.f1, i.f2, unnest(headers).key as thekey, unnest(headers).value as value from headers_src_tbl as I) i WHERE thekey = 'gus'
key f1 f2 thekey value
-------------------------------------------
fish fishval 1000 gus gusfive

# The headers dict is entirely overwritten by a new message for the same key,
# even if the value AND the remaining header haven't changed.
$ kafka-ingest format=avro topic=headers_src key-format=avro key-schema=${keyschema} schema=${schema} headers={"gus":"gusfive"}
{"key": "fish"} {"f1": "fishval", "f2": 1000}

# Both cases again: fish now has a single header, fish2 still has none.
> SELECT key, f1, f2, list_length(headers) from headers_src_tbl
key f1 f2 list_length
-------------------------------------------
fish fishval 1000 1
fish2 fishval 1000 0

# Headers with the same key are both preserved
$ kafka-ingest format=avro topic=headers_src key-format=avro key-schema=${keyschema} schema=${schema} headers=[{"gus": "a"}, {"gus": "b"}]
{"key": "fish"} {"f1": "fishval", "f2": 1000}

> SELECT key, f1, f2, headers[1].value as gus1, headers[2].value as gus2 from headers_src_tbl
key f1 f2 gus1 gus2
-------------------------------------------
fish fishval 1000 a b
fish2 fishval 1000 <null> <null>
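
# The list preserves the order in which the duplicate headers were ingested.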

# HEADERS composes with other INCLUDE options (here, PARTITION).
$ kafka-create-topic topic=headers_also partitions=1

$ kafka-ingest format=avro topic=headers_also key-format=avro key-schema=${keyschema} schema=${schema} headers={"gus":"gusfive"}
{"key": "fish"} {"f1": "fishval", "f2": 1000}

> CREATE CLUSTER headers_also_cluster SIZE '${arg.default-storage-size}';

> CREATE SOURCE headers_also
  IN CLUSTER headers_also_cluster
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-headers_also-${testdrive.seed}')

> CREATE TABLE headers_also_tbl FROM SOURCE headers_also (REFERENCE "testdrive-headers_also-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schema}'
  INCLUDE HEADERS, PARTITION
  ENVELOPE UPSERT

> SELECT key, f1, f2, list_length(headers), partition from headers_also_tbl
key f1 f2 list_length partition
-----------------------------------------------
fish fishval 1000 1 0
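
# partition 0 is expected because the topic was created with a single partition.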

# Esoteric ingestions: a header value that itself contains '='.
$ kafka-ingest format=avro topic=headers_also key-format=avro key-schema=${keyschema} schema=${schema} headers={"gus": "gus=five"}
{"key": "fish"} {"f1": "fishval", "f2": 1000}

> SELECT key, f1, f2, headers[1].value as gus, partition from headers_also_tbl
key f1 f2 gus partition
-----------------------------------------------
fish fishval 1000 gus=five 0

# A header that is present but carries a null value.
$ kafka-ingest format=avro topic=headers_also key-format=avro key-schema=${keyschema} schema=${schema} headers={"gus": null}
{"key": "fish"} {"f1": "fishval", "f2": 1000}

> SELECT key, f1, f2, headers[1].value as gus, partition from headers_also_tbl
key f1 f2 gus partition
-----------------------------------------------
fish fishval 1000 <null> 0
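
# A null header value surfaces as SQL NULL; contrast with the missing-header
# error case at the bottom of this file.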

# Conflicting naming: the value schema already contains a field named "headers".
$ set schemaheaders={
    "type" : "record",
    "name" : "test",
    "fields" : [
        {"name":"headers", "type":"string"}
    ]
  }

$ kafka-create-topic topic=headers_conflict

> CREATE SOURCE headers_conflict
  IN CLUSTER ${arg.single-replica-cluster}
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-headers_conflict-${testdrive.seed}')

! CREATE TABLE headers_conflict_tbl
  FROM SOURCE headers_conflict (REFERENCE "testdrive-headers_conflict-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schemaheaders}'
  INCLUDE HEADERS
  ENVELOPE UPSERT
contains: column "headers" specified more than once

# No meaningful way to get data out in td because of the ambiguous name
# and the weird type.
# > SELECT * from headers_conflict_tbl

$ kafka-ingest format=avro topic=headers_conflict key-format=avro key-schema=${keyschema} schema=${schemaheaders} headers={"gus": "gusfive"}
{"key": "fish"} {"headers": "value"}

# using AS to resolve it!
> CREATE CLUSTER headers_conflict2_cluster SIZE '${arg.default-storage-size}';

> CREATE SOURCE headers_conflict2
  IN CLUSTER headers_conflict2_cluster
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-headers_conflict-${testdrive.seed}')

> CREATE TABLE headers_conflict2_tbl FROM SOURCE headers_conflict2 (REFERENCE "testdrive-headers_conflict-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schemaheaders}'
  INCLUDE HEADERS AS kafka_headers
  ENVELOPE UPSERT

> SELECT key, headers, kafka_headers[1].value as gus from headers_conflict2_tbl
key headers gus
------------------------
fish value gusfive

# test extracting individual headers with INCLUDE HEADER
$ kafka-create-topic topic=individual_headers

$ kafka-ingest format=avro topic=individual_headers key-format=avro key-schema=${keyschema} schema=${schema} headers={"header1": "message_1_header_1"}
{"key": "message_1"} {"f1": "fishval", "f2": 1000}

$ kafka-ingest format=avro topic=individual_headers key-format=avro key-schema=${keyschema} schema=${schema} headers={"header1": "message_2_header_1", "header2": "message_2_header_2"}
{"key": "message_2"} {"f1": "fishval", "f2": 1000}

> CREATE CLUSTER individual_headers_cluster SIZE '${arg.default-storage-size}';

> CREATE SOURCE individual_headers
  IN CLUSTER individual_headers_cluster
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-individual_headers-${testdrive.seed}')

> CREATE TABLE individual_headers_tbl FROM SOURCE individual_headers (REFERENCE "testdrive-individual_headers-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schema}'
  INCLUDE HEADER 'header1' AS header1
  ENVELOPE UPSERT

> SELECT key, header1 from individual_headers_tbl
key header1
-------------------------------
message_1 message_1_header_1
message_2 message_2_header_1

# test exposing header as byte array
> CREATE CLUSTER individual_headers_bytes_cluster SIZE '${arg.default-storage-size}';

> CREATE SOURCE individual_headers_bytes
  IN CLUSTER individual_headers_bytes_cluster
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-individual_headers-${testdrive.seed}')

> CREATE TABLE individual_headers_bytes_tbl FROM SOURCE individual_headers_bytes (REFERENCE "testdrive-individual_headers-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schema}'
  INCLUDE HEADER 'header1' AS header1 BYTES
  ENVELOPE UPSERT

> SELECT key, header1::text from individual_headers_bytes_tbl
key header1
---------------------------------------------------
message_1 \x6d6573736167655f315f6865616465725f31
message_2 \x6d6573736167655f325f6865616465725f31
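
# A sketch (not asserted here): the raw bytes can be decoded back to text in
# SQL with convert_from, as used further below.
# > SELECT key, convert_from(header1, 'utf-8') FROM individual_headers_bytes_tbl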

# When there are multiple headers with identical keys, verify that the last
# header is exposed in the row.
$ kafka-create-topic topic=duplicate_individual_headers

$ kafka-ingest format=avro topic=duplicate_individual_headers key-format=avro key-schema=${keyschema} schema=${schema} headers=[{"duplicates": "message_3_header_3_first"}, {"duplicates": "message_3_header_3_second"}, {"duplicates": "message_3_header_3_third"}]
{"key": "message_3"} {"f1": "fishval", "f2": 1000}

> CREATE CLUSTER duplicate_individual_headers_cluster SIZE '${arg.default-storage-size}';

> CREATE SOURCE duplicate_individual_headers
  IN CLUSTER duplicate_individual_headers_cluster
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-duplicate_individual_headers-${testdrive.seed}')

> CREATE TABLE duplicate_individual_headers_tbl FROM SOURCE duplicate_individual_headers (REFERENCE "testdrive-duplicate_individual_headers-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schema}'
  INCLUDE HEADERS, HEADER 'duplicates' AS duplicates
  ENVELOPE UPSERT

> SELECT key, duplicates, headers::text from duplicate_individual_headers_tbl
key duplicates headers
-------------------------------------------------
message_3 message_3_header_3_third "{\"(duplicates,\\\"\\\\\\\\x6d6573736167655f335f6865616465725f335f6669727374\\\")\",\"(duplicates,\\\"\\\\\\\\x6d6573736167655f335f6865616465725f335f7365636f6e64\\\")\",\"(duplicates,\\\"\\\\\\\\x6d6573736167655f335f6865616465725f335f7468697264\\\")\"}"
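
# The `headers` list still contains all three duplicates; only the scalar
# `duplicates` column collapses to the last one.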

# We can control the header map more granularly with `map_agg`: for duplicate
# keys the value aggregated last wins, so the ORDER BY decides which one is kept.
> SELECT
    key,
    map_agg((headers).key, convert_from((headers).value, 'utf-8') ORDER BY (headers).value ASC)::TEXT AS headers_map
  FROM (
    SELECT
      key, unnest(headers) AS headers
    FROM duplicate_individual_headers_tbl
  )
  GROUP BY key;
key headers_map
---------------------
message_3 "{duplicates=>message_3_header_3_third}"

# Reverse the order of aggregation: now the first header's value is retained.
> SELECT
    key,
    map_agg((headers).key, convert_from((headers).value, 'utf-8') ORDER BY (headers).value DESC)::TEXT AS headers_map
  FROM (
    SELECT
      key, unnest(headers) AS headers
    FROM duplicate_individual_headers_tbl
  )
  GROUP BY key;
key headers_map
---------------------
message_3 "{duplicates=>message_3_header_3_first}"

# Verify that the source is bricked when there are headers that cannot be
# parsed as utf-8.
$ kafka-create-topic topic=ill_formed_header

$ kafka-ingest format=avro topic=ill_formed_header key-format=avro key-schema=${keyschema} schema=${schema} headers={"header1": "message_1_header_1"}
{"key": "message_1"} {"f1": "fishval", "f2": 1000}

$ kafka-ingest format=avro topic=ill_formed_header key-format=avro key-schema=${keyschema} schema=${schema} headers={"header1": "message_2_header_1"}
{"key": "message_2"} {"f1": "fishval", "f2": 1000}

> CREATE CLUSTER ill_formed_header_cluster SIZE '${arg.default-storage-size}';

> CREATE SOURCE ill_formed_header
  IN CLUSTER ill_formed_header_cluster
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-ill_formed_header-${testdrive.seed}')

> CREATE TABLE ill_formed_header_tbl FROM SOURCE ill_formed_header (REFERENCE "testdrive-ill_formed_header-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schema}'
  INCLUDE HEADERS, HEADER 'header1' AS header1
  ENVELOPE UPSERT

> SELECT key, header1 from ill_formed_header_tbl
key header1
------------------------------------------
message_1 message_1_header_1
message_2 message_2_header_1
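
# [195, 40] is not a valid utf-8 byte sequence.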
$ kafka-ingest format=avro topic=ill_formed_header key-format=avro key-schema=${keyschema} schema=${schema} headers={"header1": [195, 40]}
{"key": "message_1"} {"f1": "fishval", "f2": 1000}

! SELECT key, header1 from ill_formed_header_tbl
contains:Found ill-formed byte sequence in header 'header1' that cannot be decoded as valid utf-8 (original bytes: [c3, 28])

# Verify that the source is bricked when messages are missing the requested
# header: message_1 in the individual_headers topic was ingested without a
# 'header2' header.
> CREATE CLUSTER missing_headers_cluster SIZE '${arg.default-storage-size}';

> CREATE SOURCE missing_headers
  IN CLUSTER missing_headers_cluster
  FROM KAFKA CONNECTION kafka_conn (TOPIC
  'testdrive-individual_headers-${testdrive.seed}')

> CREATE TABLE missing_headers_tbl FROM SOURCE missing_headers (REFERENCE "testdrive-individual_headers-${testdrive.seed}")
  KEY FORMAT AVRO USING SCHEMA '${keyschema}'
  VALUE FORMAT AVRO USING SCHEMA '${schema}'
  INCLUDE HEADER 'header2' AS header2
  ENVELOPE UPSERT

! SELECT key, header2 from missing_headers_tbl
contains:A header with key 'header2' was not found in the message headers