Berry now accepts 'bytes()' as precompiled patterns, added 're.compilebytes()' (#23149)

This commit is contained in:
s-hadinger
2025-03-13 23:38:55 +01:00
committed by GitHub
parent 6486ba3b26
commit 1b51aef911
5 changed files with 616 additions and 574 deletions

View File

@@ -64,6 +64,25 @@ int be_re_compile(bvm *vm) {
be_raise(vm, "type_error", NULL);
}
// Native functions be_const_func()
// Berry: `re.compilebytes(pattern:string) -> instance(bytes)`
//
// Compiles the regex passed as argument 1 and returns the compiled program
// as a `bytes` object of `sizeof(ByteProg) + sz` bytes, where `sz` is the
// value reported by `re1_5_sizecode()`. The match/search/split entry points
// of this module accept such a `bytes` object in place of a pattern string.
//
// Raises:
//   "type_error"     when argument 1 is missing or is not a string
//   "internal_error" when the pattern cannot be sized or compiled
int be_re_compilebytes(bvm *vm) {
  int32_t argc = be_top(vm); // Get the number of arguments
  if (argc >= 1 && be_isstring(vm, 1)) {
    const char * regex_str = be_tostring(vm, 1);
    int sz = re1_5_sizecode(regex_str);
    if (sz < 0) {
      be_raise(vm, "internal_error", "error in regex");
    }

    // The program is compiled directly into the buffer of a freshly pushed
    // bytes object, so there is no be_os_malloc()/be_os_free() pair here.
    be_pushbytes(vm, NULL, sizeof(ByteProg) + sz);
    ByteProg *code = (ByteProg*) be_tobytes(vm, -1, NULL);
    int ret = re1_5_compilecode(code, regex_str);
    if (ret != 0) {
      // Same handling as the other compile sites in this file: never hand
      // back a bytes object that contains a half-built program.
      be_raise(vm, "internal_error", "error in regex");
    }
    be_return(vm);
  }
  be_raise(vm, "type_error", NULL);
}
// pushes either a list if matched, else `nil`
// return index of next offset, or -1 if not found
const char *be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbool is_anchored, bbool size_only) {
@@ -99,9 +118,10 @@ const char *be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbo
int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
int32_t argc = be_top(vm); // Get the number of arguments
if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
const char * regex_str = be_tostring(vm, 1);
if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
ByteProg *code = NULL;
int32_t offset = 0;
if (argc >= 3 && be_isint(vm, 3)) {
offset = be_toint(vm, 3);
@@ -111,22 +131,31 @@ int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
if (offset >= hay_len) { be_return_nil(vm); } // any match of empty string returns nil, this catches implicitly when hay_len == 0
hay += offset; // shift to offset
int sz = re1_5_sizecode(regex_str);
if (sz < 0) {
be_raise(vm, "internal_error", "error in regex");
}
if (be_isstring(vm, 1)) {
const char * regex_str = be_tostring(vm, 1);
int sz = re1_5_sizecode(regex_str);
if (sz < 0) {
be_raise(vm, "internal_error", "error in regex");
}
ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
if (code == NULL) {
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
}
int ret = re1_5_compilecode(code, regex_str);
if (ret != 0) {
be_os_free(code);
be_raise(vm, "internal_error", "error in regex");
code = be_os_malloc(sizeof(ByteProg) + sz);
if (code == NULL) {
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
}
int ret = re1_5_compilecode(code, regex_str);
if (ret != 0) {
be_os_free(code);
be_raise(vm, "internal_error", "error in regex");
}
} else {
code = (ByteProg *) be_tobytes(vm, 1, NULL);
}
// do the match
be_re_match_search_run(vm, code, hay, is_anchored, size_only);
be_os_free(code);
// cleanup
if (be_isstring(vm, 1)) {
be_os_free(code);
}
be_return(vm);
}
be_raise(vm, "type_error", NULL);
@@ -134,26 +163,32 @@ int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
int be_re_match_search_all(bvm *vm, bbool is_anchored) {
int32_t argc = be_top(vm); // Get the number of arguments
if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
const char * regex_str = be_tostring(vm, 1);
if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
ByteProg *code = NULL;
int limit = -1;
if (argc >= 3) {
limit = be_toint(vm, 3);
}
int sz = re1_5_sizecode(regex_str);
if (sz < 0) {
be_raise(vm, "internal_error", "error in regex");
}
ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
if (code == NULL) {
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
}
int ret = re1_5_compilecode(code, regex_str);
if (ret != 0) {
be_os_free(code);
be_raise(vm, "internal_error", "error in regex");
if (be_isstring(vm, 1)) {
const char * regex_str = be_tostring(vm, 1);
int sz = re1_5_sizecode(regex_str);
if (sz < 0) {
be_raise(vm, "internal_error", "error in regex");
}
code = be_os_malloc(sizeof(ByteProg) + sz);
if (code == NULL) {
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
}
int ret = re1_5_compilecode(code, regex_str);
if (ret != 0) {
be_os_free(code);
be_raise(vm, "internal_error", "error in regex");
}
} else {
code = (ByteProg *) be_tobytes(vm, 1, NULL);
}
be_newobject(vm, "list");
@@ -165,7 +200,10 @@ int be_re_match_search_all(bvm *vm, bbool is_anchored) {
be_pop(vm, 1);
}
be_pop(vm, 1);
be_os_free(code);
// cleanup
if (be_isstring(vm, 1)) {
be_os_free(code);
}
be_return(vm);
}
be_raise(vm, "type_error", NULL);
@@ -329,29 +367,36 @@ int re_pattern_split(bvm *vm) {
// Berry: `re.split(pattern:string|bytes, s:string [, split_limit:int]) -> list(string)`
int be_re_split(bvm *vm) {
int32_t argc = be_top(vm); // Get the number of arguments
if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
const char * regex_str = be_tostring(vm, 1);
if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
ByteProg *code = NULL;
int split_limit = -1;
if (argc >= 3) {
split_limit = be_toint(vm, 3);
}
int sz = re1_5_sizecode(regex_str);
if (sz < 0) {
be_raise(vm, "internal_error", "error in regex");
}
if (be_isstring(vm, 1)) {
const char * regex_str = be_tostring(vm, 1);
int sz = re1_5_sizecode(regex_str);
if (sz < 0) {
be_raise(vm, "internal_error", "error in regex");
}
ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
if (code == NULL) {
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
code = be_os_malloc(sizeof(ByteProg) + sz);
if (code == NULL) {
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
}
int ret = re1_5_compilecode(code, regex_str);
if (ret != 0) {
be_os_free(code);
be_raise(vm, "internal_error", "error in regex");
}
} else {
code = (ByteProg *) be_tobytes(vm, 1, NULL);
}
int ret = re1_5_compilecode(code, regex_str);
if (ret != 0) {
int ret = re_pattern_split_run(vm, code, hay, split_limit);
if (be_isstring(vm, 1)) {
be_os_free(code);
be_raise(vm, "internal_error", "error in regex");
}
ret = re_pattern_split_run(vm, code, hay, split_limit);
be_os_free(code);
return ret;
}
be_raise(vm, "type_error", NULL);
@@ -363,6 +408,7 @@ int be_re_split(bvm *vm) {
@const_object_info_begin
module re (scope: global) {
compile, func(be_re_compile)
compilebytes, func(be_re_compilebytes)
search, func(be_re_search)
searchall, func(be_re_search_all)
match, func(be_re_match)