regex-sources.td 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. # Copyright Materialize, Inc. and contributors. All rights reserved.
  2. #
  3. # Use of this software is governed by the Business Source License
  4. # included in the LICENSE file at the root of this repository.
  5. #
  6. # As of the Change Date specified in that file, in accordance with
  7. # the Business Source License, use of this software will be governed
  8. # by the Apache License, Version 2.0.
  9. $ set-arg-default default-storage-size=1
  10. $ kafka-create-topic topic=request-log
  11. $ kafka-ingest topic=request-log format=bytes
  12. 123.17.127.5 - - [22/Jan/2020 18:59:52] "GET / HTTP/1.1" 200 -
  13. 8.15.119.56 - - [22/Jan/2020 18:59:52] "GET /detail/nNZpqxzR HTTP/1.1" 200 -
  14. 96.12.83.72 - - [22/Jan/2020 18:59:52] "GET /search/?kw=helper+ins+hennaed HTTP/1.1" 200 -
  15. # Regex explained here: https://www.debuggex.com/r/k48kBEt-lTMUZbaw
  16. > CREATE CONNECTION kafka_conn
  17. TO KAFKA (BROKER '${testdrive.kafka-addr}', SECURITY PROTOCOL PLAINTEXT);
  18. > CREATE CLUSTER regex_source_cluster SIZE '${arg.default-storage-size}';
  19. > CREATE SOURCE regex_source
  20. IN CLUSTER regex_source_cluster
  21. FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-request-log-${testdrive.seed}')
  22. > CREATE TABLE regex_source_tbl
  23. FROM SOURCE regex_source (REFERENCE "testdrive-request-log-${testdrive.seed}")
  24. FORMAT REGEX '(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<ts>[^]]+)\] "(?P<path>(?:GET /search/\?kw=(?P<search_kw>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<product_detail_id>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<code>\d{3}) -'
  25. > SHOW COLUMNS FROM regex_source_tbl
  26. name nullable type comment
  27. ------------------------------------------
  28. ip true text ""
  29. ts true text ""
  30. path true text ""
  31. search_kw true text ""
  32. product_detail_id true text ""
  33. code true text ""
  34. > SELECT * FROM regex_source_tbl
  35. ip ts path search_kw product_detail_id code
  36. ---------------------------------------------------------------------------------------------------------------------------------
  37. 123.17.127.5 "22/Jan/2020 18:59:52" "GET / HTTP/1.1" <null> <null> 200
  38. 8.15.119.56 "22/Jan/2020 18:59:52" "GET /detail/nNZpqxzR HTTP/1.1" <null> nNZpqxzR 200
  39. 96.12.83.72 "22/Jan/2020 18:59:52" "GET /search/?kw=helper+ins+hennaed HTTP/1.1" helper+ins+hennaed <null> 200
  40. > SELECT search_kw FROM regex_source_tbl WHERE search_kw IS NOT NULL
  41. search_kw
  42. ------------------
  43. helper+ins+hennaed
  44. > CREATE CLUSTER regex_source_named_cols_cluster SIZE '${arg.default-storage-size}';
  45. > CREATE SOURCE regex_source_named_cols
  46. IN CLUSTER regex_source_named_cols_cluster
  47. FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-request-log-${testdrive.seed}')
  48. > CREATE TABLE regex_source_named_cols_tbl (ip, ts, path, search_kw, product_detail_id, code)
  49. FROM SOURCE regex_source_named_cols (REFERENCE "testdrive-request-log-${testdrive.seed}")
  50. FORMAT REGEX '(?P<foo1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<foo2>[^]]+)\] "(?P<foo3>(?:GET /search/\?kw=(?P<foo4>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<foo5>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<foo6>\d{3}) -'
  51. > SHOW COLUMNS FROM regex_source_named_cols_tbl
  52. name nullable type comment
  53. ------------------------------------------
  54. ip true text ""
  55. ts true text ""
  56. path true text ""
  57. search_kw true text ""
  58. product_detail_id true text ""
  59. code true text ""
  60. # verify metadata column renaming
  61. > CREATE CLUSTER regex_source_renamed_cols_cluster SIZE '${arg.default-storage-size}';
  62. > CREATE SOURCE regex_source_renamed_cols
  63. IN CLUSTER regex_source_renamed_cols_cluster
  64. FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-request-log-${testdrive.seed}')
  65. > CREATE TABLE regex_source_renamed_cols_tbl (ip, ts, path, search_kw, product_detail_id, code)
  66. FROM SOURCE regex_source_renamed_cols (REFERENCE "testdrive-request-log-${testdrive.seed}")
  67. FORMAT REGEX '(?P<foo1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<foo2>[^]]+)\] "(?P<foo3>(?:GET /search/\?kw=(?P<foo4>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<foo5>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<foo6>\d{3}) -'
  68. > SHOW COLUMNS FROM regex_source_renamed_cols_tbl
  69. name nullable type comment
  70. ------------------------------------------
  71. ip true text ""
  72. ts true text ""
  73. path true text ""
  74. search_kw true text ""
  75. product_detail_id true text ""
  76. code true text ""
  77. > SELECT * FROM regex_source_named_cols_tbl
  78. ip ts path search_kw product_detail_id code
  79. ---------------------------------------------------------------------------------------------------------------------------------
  80. 123.17.127.5 "22/Jan/2020 18:59:52" "GET / HTTP/1.1" <null> <null> 200
  81. 8.15.119.56 "22/Jan/2020 18:59:52" "GET /detail/nNZpqxzR HTTP/1.1" <null> nNZpqxzR 200
  82. 96.12.83.72 "22/Jan/2020 18:59:52" "GET /search/?kw=helper+ins+hennaed HTTP/1.1" helper+ins+hennaed <null> 200
  83. # Malformed regex with non-utf-8 characters
  84. $ kafka-create-topic topic=malformed-request-log
  85. $ kafka-ingest topic=malformed-request-log format=bytes
  86. 123.17.127.5 - - [22/Jan/2020 18:59:52] "GET / HTTP/1.1" 200 -
  87. 8.15.119.56 - - [22/Jan/2020 18:59:52] "GET /detail/nNZpqxzR HTTP/1.1" 200 -
  88. this line has invalid UTF-8 and will be cause dataflow errors --> \x80 <--
  89. 96.12.83.72 - - [22/Jan/2020 18:59:52] "GET /search/?kw=helper+ins+hennaed HTTP/1.1" 200 -
  90. > CREATE CLUSTER bad_regex_source_cluster SIZE '${arg.default-storage-size}';
  91. > CREATE SOURCE bad_regex_source
  92. IN CLUSTER bad_regex_source_cluster
  93. FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-malformed-request-log-${testdrive.seed}')
  94. > CREATE TABLE bad_regex_source_tbl
  95. FROM SOURCE bad_regex_source (REFERENCE "testdrive-malformed-request-log-${testdrive.seed}")
  96. FORMAT REGEX '(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<ts>[^]]+)\] "(?P<path>(?:GET /search/\?kw=(?P<search_kw>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<product_detail_id>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<code>\d{3}) -'
  97. ! SELECT * FROM bad_regex_source_tbl
  98. contains:UTF-8