Berry now accepts 'bytes()' as precompiled patterns, added 're.compilebytes()' (#23149)

s-hadinger 2025-03-13 23:38:55 +01:00 committed by GitHub
parent 6486ba3b26
commit 1b51aef911
5 changed files with 616 additions and 574 deletions
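Taken from the new test file further below, a minimal sketch of the difference between the existing `re.compile()` and the new `re.compilebytes()`:

```berry
import re

# classic pre-compilation: returns an opaque matcher instance with its own methods
var rr = re.compile("a.*?b(z+)")
print(rr.search("zaaaabbbccbbzzzee"))      # ['aaaabbbccbbzzz', 'zzz']

# new: compile to a plain bytes() object, accepted directly by the module-level functions
var rb = re.compilebytes("a.*?b(z+)")
print(re.search(rb, "zaaaabbbccbbzzzee"))  # ['aaaabbbccbbzzz', 'zzz']
```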

@@ -17,6 +17,7 @@ All notable changes to this project will be documented in this file.
- ESP32 enable webcam version 2 (#18732)
- ESP8266 enable FTP for >= 4MB variants (#23120)
- Berry update flasher for Sonoff ZBBridge Pro (#23136)
- Berry `re` now accepts `bytes()` as precompiled patterns, added `re.compilebytes()`
### Fixed
- Berry prevent `import` from hiding a solidified class (#23112)

@@ -64,6 +64,25 @@ int be_re_compile(bvm *vm) {
be_raise(vm, "type_error", NULL);
}
// Native functions be_const_func()
// Berry: `re.compilebytes(pattern:string) -> instance(bytes)`
int be_re_compilebytes(bvm *vm) {
int32_t argc = be_top(vm); // Get the number of arguments
if (argc >= 1 && be_isstring(vm, 1)) {
const char * regex_str = be_tostring(vm, 1);
int sz = re1_5_sizecode(regex_str);
if (sz < 0) {
be_raise(vm, "internal_error", "error in regex");
}
be_pushbytes(vm, NULL, sizeof(ByteProg) + sz);
ByteProg *code = (ByteProg*) be_tobytes(vm, -1, NULL);
re1_5_compilecode(code, regex_str);
be_return(vm);
}
be_raise(vm, "type_error", NULL);
}
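Seen from Berry, `re.compilebytes()` returns the compiled program wrapped in an ordinary `bytes()` buffer, which can then be passed wherever a pattern string is expected (minimal sketch):

```berry
import re
var rb = re.compilebytes("\\d+")   # instance of bytes holding the compiled ByteProg
print(classname(rb))               # bytes
print(re.search(rb, "abc123"))     # ['123']
```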
// pushes either a list if matched, else `nil`
// return index of next offset, or -1 if not found
const char *be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbool is_anchored, bbool size_only) {
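In Berry terms, the convention described in the two comments above is: a successful match pushes a list (full match followed by capture groups), otherwise `nil`; for example:

```berry
import re
print(re.search("z+", "abc"))       # nil (no match)
print(re.search("z+", "abzzc"))     # ['zz']
print(re.search("a(z+)", "xazzc"))  # ['azz', 'zz'] - captures follow the full match
```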
@@ -99,9 +118,10 @@ const char *be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbo
int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
int32_t argc = be_top(vm); // Get the number of arguments
- if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
- const char * regex_str = be_tostring(vm, 1);
+ if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
+ ByteProg *code = NULL;
int32_t offset = 0;
if (argc >= 3 && be_isint(vm, 3)) {
offset = be_toint(vm, 3);
@@ -111,22 +131,31 @@ int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
if (offset >= hay_len) { be_return_nil(vm); } // any match of empty string returns nil, this catches implicitly when hay_len == 0
hay += offset; // shift to offset
- int sz = re1_5_sizecode(regex_str);
- if (sz < 0) {
- be_raise(vm, "internal_error", "error in regex");
- }
+ if (be_isstring(vm, 1)) {
+ const char * regex_str = be_tostring(vm, 1);
+ int sz = re1_5_sizecode(regex_str);
+ if (sz < 0) {
+ be_raise(vm, "internal_error", "error in regex");
+ }
- ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
- if (code == NULL) {
- be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
- }
- int ret = re1_5_compilecode(code, regex_str);
- if (ret != 0) {
- be_os_free(code);
- be_raise(vm, "internal_error", "error in regex");
+ code = be_os_malloc(sizeof(ByteProg) + sz);
+ if (code == NULL) {
+ be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
+ }
+ int ret = re1_5_compilecode(code, regex_str);
+ if (ret != 0) {
+ be_os_free(code);
+ be_raise(vm, "internal_error", "error in regex");
+ }
+ } else {
+ code = (ByteProg *) be_tobytes(vm, 1, NULL);
+ }
// do the match
be_re_match_search_run(vm, code, hay, is_anchored, size_only);
- be_os_free(code);
+ // cleanup
+ if (be_isstring(vm, 1)) {
+ be_os_free(code);
+ }
be_return(vm);
}
be_raise(vm, "type_error", NULL);
@@ -134,26 +163,32 @@ int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
int be_re_match_search_all(bvm *vm, bbool is_anchored) {
int32_t argc = be_top(vm); // Get the number of arguments
- if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
- const char * regex_str = be_tostring(vm, 1);
+ if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
+ ByteProg *code = NULL;
int limit = -1;
if (argc >= 3) {
limit = be_toint(vm, 3);
}
- int sz = re1_5_sizecode(regex_str);
- if (sz < 0) {
- be_raise(vm, "internal_error", "error in regex");
- }
- ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
- if (code == NULL) {
- be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
- }
- int ret = re1_5_compilecode(code, regex_str);
- if (ret != 0) {
- be_os_free(code);
- be_raise(vm, "internal_error", "error in regex");
+ if (be_isstring(vm, 1)) {
+ const char * regex_str = be_tostring(vm, 1);
+ int sz = re1_5_sizecode(regex_str);
+ if (sz < 0) {
+ be_raise(vm, "internal_error", "error in regex");
+ }
+ code = be_os_malloc(sizeof(ByteProg) + sz);
+ if (code == NULL) {
+ be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
+ }
+ int ret = re1_5_compilecode(code, regex_str);
+ if (ret != 0) {
+ be_os_free(code);
+ be_raise(vm, "internal_error", "error in regex");
+ }
+ } else {
+ code = (ByteProg *) be_tobytes(vm, 1, NULL);
+ }
be_newobject(vm, "list");
@@ -165,7 +200,10 @@ int be_re_match_search_all(bvm *vm, bbool is_anchored) {
be_pop(vm, 1);
}
be_pop(vm, 1);
- be_os_free(code);
+ // cleanup
+ if (be_isstring(vm, 1)) {
+ be_os_free(code);
+ }
be_return(vm);
}
be_raise(vm, "type_error", NULL);
@@ -329,29 +367,36 @@ int re_pattern_split(bvm *vm) {
// Berry: `re.split(pattern:string, s:string [, split_limit:int]) -> list(string)`
int be_re_split(bvm *vm) {
int32_t argc = be_top(vm); // Get the number of arguments
- if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
- const char * regex_str = be_tostring(vm, 1);
+ if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
+ ByteProg *code = NULL;
int split_limit = -1;
if (argc >= 3) {
split_limit = be_toint(vm, 3);
}
- int sz = re1_5_sizecode(regex_str);
- if (sz < 0) {
- be_raise(vm, "internal_error", "error in regex");
- }
+ if (be_isstring(vm, 1)) {
+ const char * regex_str = be_tostring(vm, 1);
+ int sz = re1_5_sizecode(regex_str);
+ if (sz < 0) {
+ be_raise(vm, "internal_error", "error in regex");
+ }
- ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
- if (code == NULL) {
- be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
+ code = be_os_malloc(sizeof(ByteProg) + sz);
+ if (code == NULL) {
+ be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
+ }
+ int ret = re1_5_compilecode(code, regex_str);
+ if (ret != 0) {
+ be_os_free(code);
+ be_raise(vm, "internal_error", "error in regex");
+ }
+ } else {
+ code = (ByteProg *) be_tobytes(vm, 1, NULL);
+ }
- int ret = re1_5_compilecode(code, regex_str);
- if (ret != 0) {
+ int ret = re_pattern_split_run(vm, code, hay, split_limit);
+ if (be_isstring(vm, 1)) {
be_os_free(code);
- be_raise(vm, "internal_error", "error in regex");
}
- ret = re_pattern_split_run(vm, code, hay, split_limit);
- be_os_free(code);
return ret;
}
be_raise(vm, "type_error", NULL);
@@ -363,6 +408,7 @@ int be_re_split(bvm *vm) {
@const_object_info_begin
module re (scope: global) {
compile, func(be_re_compile)
compilebytes, func(be_re_compilebytes)
search, func(be_re_search)
searchall, func(be_re_search_all)
match, func(be_re_match)

@@ -0,0 +1,52 @@
# test regex from re1.5
import re
# standard use of lib
assert(re.search("a.*?b(z+)", "zaaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(re.searchall('<([a-zA-Z]+)>', '<abc> yeah <xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
assert(re.match("a.*?b(z+)", "aaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(re.match2("a.*?b(z+)", "aaaabbbccbbzzzee") == [14, 'zzz'])
assert(re.matchall('<([a-zA-Z]+)>', '<abc> yeah <xyz>') == [['<abc>', 'abc']])
assert(re.matchall('<([a-zA-Z]+)>', '<abc><xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
assert(re.split('/', "foo/bar//baz") == ['foo', 'bar', '', 'baz'])
# pre-compile
var rr
rr = re.compile("a.*?b(z+)")
assert(rr.search("zaaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
rr = re.compile('<([a-zA-Z]+)>')
assert(rr.searchall('<abc> yeah <xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
rr = re.compile("a.*?b(z+)")
assert(rr.match("aaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(rr.match2("aaaabbbccbbzzzee") == [14, 'zzz'])
rr = re.compile('<([a-zA-Z]+)>')
assert(rr.matchall('<abc> yeah <xyz>') == [['<abc>', 'abc']])
assert(rr.matchall('<abc><xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
rr = re.compile('/')
assert(rr.split("foo/bar//baz") == ['foo', 'bar', '', 'baz'])
# compile to bytes
var rb
rb = re.compilebytes("a.*?b(z+)")
assert(re.search(rb, "zaaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(rb == bytes('1B0000000F0000000100000062030260FB7E00016162030260FB01627E02017A62FC7E037E017F'))
rb = re.compilebytes('<([a-zA-Z]+)>')
assert(re.searchall(rb, '<abc> yeah <xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
assert(rb == bytes('1A0000000C0000000100000062030260FB7E00013C7E020302617A415A62F87E03013E7E017F'))
rb = re.compilebytes("a.*?b(z+)")
assert(re.match(rb, "aaaabbbccbbzzzee") == ['aaaabbbccbbzzz', 'zzz'])
assert(re.match2(rb, "aaaabbbccbbzzzee") == [14, 'zzz'])
assert(rb == bytes('1B0000000F0000000100000062030260FB7E00016162030260FB01627E02017A62FC7E037E017F'))
rb = re.compilebytes('<([a-zA-Z]+)>')
assert(re.matchall(rb, '<abc> yeah <xyz>') == [['<abc>', 'abc']])
assert(re.matchall(rb, '<abc><xyz>') == [['<abc>', 'abc'], ['<xyz>', 'xyz']])
assert(rb == bytes('1A0000000C0000000100000062030260FB7E00013C7E020302617A415A62F87E03013E7E017F'))
rb = re.compilebytes('/')
assert(re.split(rb, "foo/bar//baz") == ['foo', 'bar', '', 'baz'])
assert(rb == bytes('0C000000070000000000000062030260FB7E00012F7E017F'))
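Because the compiled pattern is an ordinary `bytes()` value, the dumps asserted above can also be embedded as literals and used without calling `re.compilebytes()` at runtime; the webserver class below relies on exactly this by storing its compiled patterns as `static var` bytes:

```berry
import re
# precompiled '/' pattern, literal taken from the assert above
var rb = bytes('0C000000070000000000000062030260FB7E00012F7E017F')
assert(re.split(rb, "foo/bar//baz") == ['foo', 'bar', '', 'baz'])
```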

@@ -238,7 +238,10 @@ class webserver_async
# pre: self.buf_in is not empty
# post: self.buf_in has made progress (smaller or '')
def parse_http_req_line()
- var m = global._re_http_srv.match2(self.buf_in, self.buf_in_offset)
+ import re
+ # print("parse_http_req_line", "self.buf_in=", self.buf_in)
+ var m = re.match2(self.server.re_http_srv, self.buf_in, self.buf_in_offset)
+ # print(f"{m=}")
# Ex: "GET / HTTP/1.1\r\n"
if m
var offset = m[0]
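For reference, `match2` with the precompiled request-line pattern returns the number of consumed bytes followed by the capture groups, so `m[0]` is the offset of the next unparsed byte; a sketch with a hypothetical request:

```berry
import re
var re_req = re.compilebytes("^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n")
var m = re.match2(re_req, "GET /index.html HTTP/1.1\r\nHost: x\r\n")
print(m)   # [26, 'GET', '/index.html', '1.1'] - 26 bytes consumed by the request line
```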
@@ -261,16 +264,18 @@ class webserver_async
#############################################################
# parse incoming headers
def parse_http_headers()
+ import re
while true
# print("parse_http_headers", "self.buf_in_offset=", self.buf_in_offset)
- var m = global._re_http_srv_header.match2(self.buf_in, self.buf_in_offset)
- # print("m=", m)
+ var m = re.match2(self.server.re_http_srv_header, self.buf_in, self.buf_in_offset)
+ # print(f"{m=}")
# Ex: [32, 'Content-Type', 'application/json']
if m
self.event_http_header(m[1], m[2])
self.buf_in_offset += m[0]
else # no more headers
- var m2 = global._re_http_srv_body.match2(self.buf_in, self.buf_in_offset)
+ var m2 = re.match2(self.server.re_http_srv_body, self.buf_in, self.buf_in_offset)
+ # print(f"{m2=}")
if m2
# end of headers
# we keep \r\n which is used by pattern
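The header pattern behaves the same way: each successful `match2` yields the consumed length plus the name/value pair shown in the `Ex:` comment above (sketch):

```berry
import re
var re_hdr = re.compilebytes("([A-Za-z0-9-]+): (.*?)\r\n")
print(re.match2(re_hdr, "Content-Type: application/json\r\n"))
# [32, 'Content-Type', 'application/json']
```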
@@ -519,9 +524,16 @@ class webserver_async
var p1 # temporary object bytes() to avoid reallocation
# static var TIMEOUT = 1000 # default timeout: 1000ms
#############################################################
# pre-compile REGEX
#
# static var HTTP_REQ = "^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n"
# static var HTTP_HEADER_REGEX = "([A-Za-z0-9-]+): (.*?)\r\n" # extract a header with its 2 parts
# static var HTTP_BODY_REGEX = "\r\n" # end of headers
static var re_http_srv = re.compilebytes("^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n")
static var re_http_srv_header = re.compilebytes("([A-Za-z0-9-]+): (.*?)\r\n")
static var re_http_srv_body = re.compilebytes("\r\n")
#############################################################
# init
@@ -535,27 +547,12 @@ class webserver_async
self.cors = false
self.p1 = bytes(100) # reserve 100 bytes by default
# TODO what about max_clients ?
- self.compile_re()
# register cb
tasmota.add_driver(self)
self.fastloop_cb = def () self.loop() end
tasmota.add_fast_loop(self.fastloop_cb)
end
#############################################################
- # compile once for all the regex
- def compile_re()
- import re
- if !global.contains("_re_http_srv")
- # global._re_http_srv = re.compile(self.HTTP_REQ)
- # global._re_http_srv_header = re.compile(self.HTTP_HEADER_REGEX)
- # global._re_http_srv_body = re.compile(self.HTTP_BODY_REGEX)
- global._re_http_srv = re.compile("^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n")
- global._re_http_srv_header = re.compile("([A-Za-z0-9-]+): (.*?)\r\n")
- global._re_http_srv_body = re.compile("\r\n")
- end
- end
#############################################################
# enable or disable chunked mode (enabled by default)
def set_chunked(chunked)
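The class-level idiom introduced here (compile once into a `static var` holding bytes, then pass it to the module-level `re` functions) can be reused in other drivers; a minimal sketch with a hypothetical class and pattern:

```berry
import re

class kv_parser
  # compiled once at class definition time; stored as plain bytes, so it can be solidified
  static var re_kv = re.compilebytes("(\\w+)=(\\w+)")

  def parse(s)
    return re.match2(self.re_kv, s)   # [consumed, key, value] or nil
  end
end

print(kv_parser().parse("key=value"))   # [9, 'key', 'value']
```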