regex-sources.td 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. # Copyright Materialize, Inc. and contributors. All rights reserved.
  2. #
  3. # Use of this software is governed by the Business Source License
  4. # included in the LICENSE file at the root of this repository.
  5. #
  6. # As of the Change Date specified in that file, in accordance with
  7. # the Business Source License, use of this software will be governed
  8. # by the Apache License, Version 2.0.
  9. $ set-arg-default default-storage-size=1
  10. $ kafka-create-topic topic=request-log
  11. $ kafka-ingest topic=request-log format=bytes
  12. 123.17.127.5 - - [22/Jan/2020 18:59:52] "GET / HTTP/1.1" 200 -
  13. 8.15.119.56 - - [22/Jan/2020 18:59:52] "GET /detail/nNZpqxzR HTTP/1.1" 200 -
  14. 96.12.83.72 - - [22/Jan/2020 18:59:52] "GET /search/?kw=helper+ins+hennaed HTTP/1.1" 200 -
  15. # Regex explained here: https://www.debuggex.com/r/k48kBEt-lTMUZbaw
  16. > CREATE CONNECTION kafka_conn
  17. TO KAFKA (BROKER '${testdrive.kafka-addr}', SECURITY PROTOCOL PLAINTEXT);
  18. > CREATE CLUSTER regex_source_cluster SIZE '${arg.default-storage-size}';
  19. > CREATE SOURCE regex_source
  20. IN CLUSTER regex_source_cluster
  21. FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-request-log-${testdrive.seed}')
  22. FORMAT REGEX '(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<ts>[^]]+)\] "(?P<path>(?:GET /search/\?kw=(?P<search_kw>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<product_detail_id>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<code>\d{3}) -'
  23. > SHOW COLUMNS FROM regex_source
  24. name nullable type comment
  25. ------------------------------------------
  26. ip true text ""
  27. ts true text ""
  28. path true text ""
  29. search_kw true text ""
  30. product_detail_id true text ""
  31. code true text ""
  32. > SELECT * FROM regex_source
  33. ip ts path search_kw product_detail_id code
  34. ---------------------------------------------------------------------------------------------------------------------------------
  35. 123.17.127.5 "22/Jan/2020 18:59:52" "GET / HTTP/1.1" <null> <null> 200
  36. 8.15.119.56 "22/Jan/2020 18:59:52" "GET /detail/nNZpqxzR HTTP/1.1" <null> nNZpqxzR 200
  37. 96.12.83.72 "22/Jan/2020 18:59:52" "GET /search/?kw=helper+ins+hennaed HTTP/1.1" helper+ins+hennaed <null> 200
  38. > SELECT search_kw FROM regex_source WHERE search_kw IS NOT NULL
  39. search_kw
  40. ------------------
  41. helper+ins+hennaed
  42. > CREATE CLUSTER regex_source_named_cols_cluster SIZE '${arg.default-storage-size}';
  43. > CREATE SOURCE regex_source_named_cols (ip, ts, path, search_kw, product_detail_id, code)
  44. IN CLUSTER regex_source_named_cols_cluster
  45. FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-request-log-${testdrive.seed}')
  46. FORMAT REGEX '(?P<foo1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<foo2>[^]]+)\] "(?P<foo3>(?:GET /search/\?kw=(?P<foo4>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<foo5>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<foo6>\d{3}) -'
  47. > SHOW COLUMNS FROM regex_source_named_cols
  48. name nullable type comment
  49. ------------------------------------------
  50. ip true text ""
  51. ts true text ""
  52. path true text ""
  53. search_kw true text ""
  54. product_detail_id true text ""
  55. code true text ""
  56. # verify metadata column renaming
  57. > CREATE CLUSTER regex_source_renamed_cols_cluster SIZE '${arg.default-storage-size}';
  58. > CREATE SOURCE regex_source_renamed_cols (ip, ts, path, search_kw, product_detail_id, code)
  59. IN CLUSTER regex_source_renamed_cols_cluster
  60. FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-request-log-${testdrive.seed}')
  61. FORMAT REGEX '(?P<foo1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<foo2>[^]]+)\] "(?P<foo3>(?:GET /search/\?kw=(?P<foo4>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<foo5>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<foo6>\d{3}) -'
  62. > SHOW COLUMNS FROM regex_source_renamed_cols
  63. name nullable type comment
  64. ------------------------------------------
  65. ip true text ""
  66. ts true text ""
  67. path true text ""
  68. search_kw true text ""
  69. product_detail_id true text ""
  70. code true text ""
  71. > SELECT * FROM regex_source_named_cols
  72. ip ts path search_kw product_detail_id code
  73. ---------------------------------------------------------------------------------------------------------------------------------
  74. 123.17.127.5 "22/Jan/2020 18:59:52" "GET / HTTP/1.1" <null> <null> 200
  75. 8.15.119.56 "22/Jan/2020 18:59:52" "GET /detail/nNZpqxzR HTTP/1.1" <null> nNZpqxzR 200
  76. 96.12.83.72 "22/Jan/2020 18:59:52" "GET /search/?kw=helper+ins+hennaed HTTP/1.1" helper+ins+hennaed <null> 200
  77. # Malformed regex with non-utf-8 characters
  78. $ kafka-create-topic topic=malformed-request-log
  79. $ kafka-ingest topic=malformed-request-log format=bytes
  80. 123.17.127.5 - - [22/Jan/2020 18:59:52] "GET / HTTP/1.1" 200 -
  81. 8.15.119.56 - - [22/Jan/2020 18:59:52] "GET /detail/nNZpqxzR HTTP/1.1" 200 -
  82. this line has invalid UTF-8 and will be cause dataflow errors --> \x80 <--
  83. 96.12.83.72 - - [22/Jan/2020 18:59:52] "GET /search/?kw=helper+ins+hennaed HTTP/1.1" 200 -
  84. > CREATE CLUSTER bad_regex_source_cluster SIZE '${arg.default-storage-size}';
  85. > CREATE SOURCE bad_regex_source
  86. IN CLUSTER bad_regex_source_cluster
  87. FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-malformed-request-log-${testdrive.seed}')
  88. FORMAT REGEX '(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<ts>[^]]+)\] "(?P<path>(?:GET /search/\?kw=(?P<search_kw>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<product_detail_id>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<code>\d{3}) -'
  89. ! SELECT * FROM bad_regex_source
  90. contains:UTF-8