Berry now accepts 'bytes()' as precompiled patterns, added 're.compilebytes()' (#23149)

s-hadinger 2025-03-13 23:38:55 +01:00 committed by GitHub
parent 6486ba3b26
commit 1b51aef911
5 changed files with 616 additions and 574 deletions
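Taken from the new test file further below, a minimal sketch of the difference between the existing `re.compile()` and the new `re.compilebytes()`:

```berry
import re

# classic pre-compilation: returns an opaque matcher instance with its own methods
var rr = re.compile("a.*?b(z+)")
print(rr.search("zaaaabbbccbbzzzee"))      # ['aaaabbbccbbzzz', 'zzz']

# new: compile to a plain bytes() object, accepted directly by the module-level functions
var rb = re.compilebytes("a.*?b(z+)")
print(re.search(rb, "zaaaabbbccbbzzzee"))  # ['aaaabbbccbbzzz', 'zzz']
```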

@@ -17,6 +17,7 @@ All notable changes to this project will be documented in this file.
- ESP32 enable webcam version 2 (#18732)
- ESP8266 enable FTP for >= 4MB variants (#23120)
- Berry update flasher for Sonoff ZBBridge Pro (#23136)
- Berry `re` now accepts `bytes()` as precompiled patterns, added `re.compilebytes()`
### Fixed
- Berry prevent `import` from hiding a solidified class (#23112)

@@ -64,6 +64,25 @@ int be_re_compile(bvm *vm) {
be_raise(vm, "type_error", NULL);
}
// Native functions be_const_func()
// Berry: `re.compilebytes(pattern:string) -> instance(bytes)`
int be_re_compilebytes(bvm *vm) {
int32_t argc = be_top(vm); // Get the number of arguments
if (argc >= 1 && be_isstring(vm, 1)) {
const char * regex_str = be_tostring(vm, 1);
int sz = re1_5_sizecode(regex_str);
if (sz < 0) {
be_raise(vm, "internal_error", "error in regex");
}
be_pushbytes(vm, NULL, sizeof(ByteProg) + sz);
ByteProg *code = (ByteProg*) be_tobytes(vm, -1, NULL);
re1_5_compilecode(code, regex_str);
be_return(vm);
}
be_raise(vm, "type_error", NULL);
}
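Seen from Berry, `re.compilebytes()` returns the compiled program wrapped in an ordinary `bytes()` buffer, which can then be passed wherever a pattern string is expected (minimal sketch):

```berry
import re
var rb = re.compilebytes("\\d+")   # instance of bytes holding the compiled ByteProg
print(classname(rb))               # bytes
print(re.search(rb, "abc123"))     # ['123']
```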
// pushes either a list if matched, else `nil`
// return index of next offset, or -1 if not found
const char *be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbool is_anchored, bbool size_only) {
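In Berry terms, the convention described in the two comments above is: a successful match pushes a list (full match followed by capture groups), otherwise `nil`; for example:

```berry
import re
print(re.search("z+", "abc"))       # nil (no match)
print(re.search("z+", "abzzc"))     # ['zz']
print(re.search("a(z+)", "xazzc"))  # ['azz', 'zz'] - captures follow the full match
```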
@@ -99,9 +118,10 @@ const char *be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbo
int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
int32_t argc = be_top(vm); // Get the number of arguments
- if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
- const char * regex_str = be_tostring(vm, 1);
+ if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
+ ByteProg *code = NULL;
int32_t offset = 0;
if (argc >= 3 && be_isint(vm, 3)) {
offset = be_toint(vm, 3);
@@ -111,22 +131,31 @@ int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
if (offset >= hay_len) { be_return_nil(vm); } // any match of empty string returns nil, this catches implicitly when hay_len == 0
hay += offset; // shift to offset
- int sz = re1_5_sizecode(regex_str);
- if (sz < 0) {
- be_raise(vm, "internal_error", "error in regex");
- }
+ if (be_isstring(vm, 1)) {
+ const char * regex_str = be_tostring(vm, 1);
+ int sz = re1_5_sizecode(regex_str);
+ if (sz < 0) {
+ be_raise(vm, "internal_error", "error in regex");
+ }
- ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
- if (code == NULL) {
- be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
- }
- int ret = re1_5_compilecode(code, regex_str);
- if (ret != 0) {
- be_os_free(code);
- be_raise(vm, "internal_error", "error in regex");
+ code = be_os_malloc(sizeof(ByteProg) + sz);
+ if (code == NULL) {
+ be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
+ }
+ int ret = re1_5_compilecode(code, regex_str);
+ if (ret != 0) {
+ be_os_free(code);
+ be_raise(vm, "internal_error", "error in regex");
+ }
+ } else {
+ code = (ByteProg *) be_tobytes(vm, 1, NULL);
+ }
// do the match
be_re_match_search_run(vm, code, hay, is_anchored, size_only);
- be_os_free(code);
+ // cleanup
+ if (be_isstring(vm, 1)) {
+ be_os_free(code);
+ }
be_return(vm);
}
be_raise(vm, "type_error", NULL);
@@ -134,26 +163,32 @@ int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
int be_re_match_search_all(bvm *vm, bbool is_anchored) {
int32_t argc = be_top(vm); // Get the number of arguments
- if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
- const char * regex_str = be_tostring(vm, 1);
+ if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
+ ByteProg *code = NULL;
int limit = -1;
if (argc >= 3) {
limit = be_toint(vm, 3);
}
- int sz = re1_5_sizecode(regex_str);
- if (sz < 0) {
- be_raise(vm, "internal_error", "error in regex");
- }
- ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
- if (code == NULL) {
- be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
- }
- int ret = re1_5_compilecode(code, regex_str);
- if (ret != 0) {
- be_os_free(code);
- be_raise(vm, "internal_error", "error in regex");
+ if (be_isstring(vm, 1)) {
+ const char * regex_str = be_tostring(vm, 1);
+ int sz = re1_5_sizecode(regex_str);
+ if (sz < 0) {
+ be_raise(vm, "internal_error", "error in regex");
+ }
+ code = be_os_malloc(sizeof(ByteProg) + sz);
+ if (code == NULL) {
+ be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
+ }
+ int ret = re1_5_compilecode(code, regex_str);
+ if (ret != 0) {
+ be_os_free(code);
+ be_raise(vm, "internal_error", "error in regex");
+ }
+ } else {
+ code = (ByteProg *) be_tobytes(vm, 1, NULL);
+ }
be_newobject(vm, "list");
@@ -165,7 +200,10 @@ int be_re_match_search_all(bvm *vm, bbool is_anchored) {
be_pop(vm, 1);
}
be_pop(vm, 1);
- be_os_free(code);
+ // cleanup
+ if (be_isstring(vm, 1)) {
+ be_os_free(code);
+ }
be_return(vm);
}
be_raise(vm, "type_error", NULL);
@@ -329,29 +367,36 @@ int re_pattern_split(bvm *vm) {
// Berry: `re.split(pattern:string, s:string [, split_limit:int]) -> list(string)`
int be_re_split(bvm *vm) {
int32_t argc = be_top(vm); // Get the number of arguments
- if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
- const char * regex_str = be_tostring(vm, 1);
+ if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
+ ByteProg *code = NULL;
int split_limit = -1;
if (argc >= 3) {
split_limit = be_toint(vm, 3);
}
- int sz = re1_5_sizecode(regex_str);
- if (sz < 0) {
- be_raise(vm, "internal_error", "error in regex");
- }
+ if (be_isstring(vm, 1)) {
+ const char * regex_str = be_tostring(vm, 1);
+ int sz = re1_5_sizecode(regex_str);
+ if (sz < 0) {
+ be_raise(vm, "internal_error", "error in regex");
+ }
- ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
- if (code == NULL) {
- be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
+ code = be_os_malloc(sizeof(ByteProg) + sz);
+ if (code == NULL) {
+ be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
+ }
+ int ret = re1_5_compilecode(code, regex_str);
+ if (ret != 0) {
+ be_os_free(code);
+ be_raise(vm, "internal_error", "error in regex");
+ }
+ } else {
+ code = (ByteProg *) be_tobytes(vm, 1, NULL);
+ }
- int ret = re1_5_compilecode(code, regex_str);
- if (ret != 0) {
+ int ret = re_pattern_split_run(vm, code, hay, split_limit);
+ if (be_isstring(vm, 1)) {
be_os_free(code);
- be_raise(vm, "internal_error", "error in regex");
}
- ret = re_pattern_split_run(vm, code, hay, split_limit);
- be_os_free(code);
return ret;
}
be_raise(vm, "type_error", NULL);
@@ -363,6 +408,7 @@ int be_re_split(bvm *vm) {
@const_object_info_begin
module re (scope: global) {
compile, func(be_re_compile)
compilebytes, func(be_re_compilebytes)
search, func(be_re_search)
searchall, func(be_re_search_all)
match, func(be_re_match)

@@ -0,0 +1,52 @@
# test regex from re1.5
import re
# standard use of lib
assert(re.search("a.*?b(z+)", "zaaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(re.searchall('<([a-zA-Z]+)>', '<abc> yeah <xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
assert(re.match("a.*?b(z+)", "aaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(re.match2("a.*?b(z+)", "aaaabbbccbbzzzee") == [14, 'zzz'])
assert(re.matchall('<([a-zA-Z]+)>', '<abc> yeah <xyz>') == [['<abc>', 'abc']])
assert(re.matchall('<([a-zA-Z]+)>', '<abc><xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
assert(re.split('/', "foo/bar//baz") == ['foo', 'bar', '', 'baz'])
# pre-compile
var rr
rr = re.compile("a.*?b(z+)")
assert(rr.search("zaaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
rr = re.compile('<([a-zA-Z]+)>')
assert(rr.searchall('<abc> yeah <xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
rr = re.compile("a.*?b(z+)")
assert(rr.match("aaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(rr.match2("aaaabbbccbbzzzee") == [14, 'zzz'])
rr = re.compile('<([a-zA-Z]+)>')
assert(rr.matchall('<abc> yeah <xyz>') == [['<abc>', 'abc']])
assert(rr.matchall('<abc><xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
rr = re.compile('/')
assert(rr.split("foo/bar//baz") == ['foo', 'bar', '', 'baz'])
# compile to bytes
var rb
rb = re.compilebytes("a.*?b(z+)")
assert(re.search(rb, "zaaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(rb == bytes('1B0000000F0000000100000062030260FB7E00016162030260FB01627E02017A62FC7E037E017F'))
rb = re.compilebytes('<([a-zA-Z]+)>')
assert(re.searchall(rb, '<abc> yeah <xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
assert(rb == bytes('1A0000000C0000000100000062030260FB7E00013C7E020302617A415A62F87E03013E7E017F'))
rb = re.compilebytes("a.*?b(z+)")
assert(re.match(rb, "aaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(re.match2(rb, "aaaabbbccbbzzzee") == [14, 'zzz'])
assert(rb == bytes('1B0000000F0000000100000062030260FB7E00016162030260FB01627E02017A62FC7E037E017F'))
rb = re.compilebytes('<([a-zA-Z]+)>')
assert(re.matchall(rb, '<abc> yeah <xyz>') == [['<abc>', 'abc']])
assert(re.matchall(rb, '<abc><xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
assert(rb == bytes('1A0000000C0000000100000062030260FB7E00013C7E020302617A415A62F87E03013E7E017F'))
rb = re.compilebytes('/')
assert(re.split(rb, "foo/bar//baz") == ['foo', 'bar', '', 'baz'])
assert(rb == bytes('0C000000070000000000000062030260FB7E00012F7E017F'))
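Because the compiled pattern is an ordinary `bytes()` value, the dumps asserted above can also be embedded as literals and used without calling `re.compilebytes()` at runtime; the webserver class below relies on exactly this by storing its compiled patterns as `static var` bytes:

```berry
import re
# precompiled '/' pattern, literal taken from the assert above
var rb = bytes('0C000000070000000000000062030260FB7E00012F7E017F')
assert(re.split(rb, "foo/bar//baz") == ['foo', 'bar', '', 'baz'])
```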

@@ -238,7 +238,10 @@ class webserver_async
# pre: self.buf_in is not empty
# post: self.buf_in has made progress (smaller or '')
def parse_http_req_line()
- var m = global._re_http_srv.match2(self.buf_in, self.buf_in_offset)
+ import re
+ # print("parse_http_req_line", "self.buf_in=", self.buf_in)
+ var m = re.match2(self.server.re_http_srv, self.buf_in, self.buf_in_offset)
+ # print(f"{m=}")
# Ex: "GET / HTTP/1.1\r\n"
if m
var offset = m[0]
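For reference, `match2` with the precompiled request-line pattern returns the number of consumed bytes followed by the capture groups, so `m[0]` is the offset of the next unparsed byte; a sketch with a hypothetical request:

```berry
import re
var re_req = re.compilebytes("^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n")
var m = re.match2(re_req, "GET /index.html HTTP/1.1\r\nHost: x\r\n")
print(m)   # [26, 'GET', '/index.html', '1.1'] - 26 bytes consumed by the request line
```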
@@ -261,16 +264,18 @@ class webserver_async
#############################################################
# parse incoming headers
def parse_http_headers()
+ import re
while true
# print("parse_http_headers", "self.buf_in_offset=", self.buf_in_offset)
- var m = global._re_http_srv_header.match2(self.buf_in, self.buf_in_offset)
- # print("m=", m)
+ var m = re.match2(self.server.re_http_srv_header, self.buf_in, self.buf_in_offset)
+ # print(f"{m=}")
# Ex: [32, 'Content-Type', 'application/json']
if m
self.event_http_header(m[1], m[2])
self.buf_in_offset += m[0]
else # no more headers
- var m2 = global._re_http_srv_body.match2(self.buf_in, self.buf_in_offset)
+ var m2 = re.match2(self.server.re_http_srv_body, self.buf_in, self.buf_in_offset)
+ # print(f"{m2=}")
if m2
# end of headers
# we keep \r\n which is used by pattern
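The header pattern behaves the same way: each successful `match2` yields the consumed length plus the name/value pair shown in the `Ex:` comment above (sketch):

```berry
import re
var re_hdr = re.compilebytes("([A-Za-z0-9-]+): (.*?)\r\n")
print(re.match2(re_hdr, "Content-Type: application/json\r\n"))
# [32, 'Content-Type', 'application/json']
```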
@@ -519,9 +524,16 @@ class webserver_async
var p1 # temporary object bytes() to avoid reallocation
# static var TIMEOUT = 1000 # default timeout: 1000ms
#############################################################
# pre-compile REGEX
#
# static var HTTP_REQ = "^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n"
# static var HTTP_HEADER_REGEX = "([A-Za-z0-9-]+): (.*?)\r\n" # extract a header with its 2 parts
# static var HTTP_BODY_REGEX = "\r\n" # end of headers
static var re_http_srv = re.compilebytes("^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n")
static var re_http_srv_header = re.compilebytes("([A-Za-z0-9-]+): (.*?)\r\n")
static var re_http_srv_body = re.compilebytes("\r\n")
#############################################################
# init
@@ -535,27 +547,12 @@ class webserver_async
self.cors = false
self.p1 = bytes(100) # reserve 100 bytes by default
# TODO what about max_clients ?
- self.compile_re()
# register cb
tasmota.add_driver(self)
self.fastloop_cb = def () self.loop() end
tasmota.add_fast_loop(self.fastloop_cb)
end
#############################################################
- # compile once for all the regex
- def compile_re()
- import re
- if !global.contains("_re_http_srv")
- # global._re_http_srv = re.compile(self.HTTP_REQ)
- # global._re_http_srv_header = re.compile(self.HTTP_HEADER_REGEX)
- # global._re_http_srv_body = re.compile(self.HTTP_BODY_REGEX)
- global._re_http_srv = re.compile("^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n")
- global._re_http_srv_header = re.compile("([A-Za-z0-9-]+): (.*?)\r\n")
- global._re_http_srv_body = re.compile("\r\n")
- end
- end
#############################################################
# enable or disable chunked mode (enabled by default)
def set_chunked(chunked)
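The class-level idiom introduced here (compile once into a `static var` holding bytes, then pass it to the module-level `re` functions) can be reused in other drivers; a minimal sketch with a hypothetical class and pattern:

```berry
import re

class kv_parser
  # compiled once at class definition time; stored as plain bytes, so it can be solidified
  static var re_kv = re.compilebytes("(\\w+)=(\\w+)")

  def parse(s)
    return re.match2(self.re_kv, s)   # [consumed, key, value] or nil
  end
end

print(kv_parser().parse("key=value"))   # [9, 'key', 'value']
```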