From 485099fd6e9e88fdc7521fff7e54a6d7023e9871 Mon Sep 17 00:00:00 2001 From: Ian Henry Date: Sat, 22 Apr 2023 23:40:32 -0700 Subject: [PATCH 1/2] string and peg replacement functions can now take functions Functions will be invoked with the matched text, and their result will be coerced to a string and used as the new replacement text. This also allows passing non-function, non-byteviewable values, which will be converted into strings during replacement (only once, and only if at least one match is found). --- src/core/peg.c | 19 +++++++++++++------ src/core/string.c | 22 +++++++++++++--------- src/core/util.c | 40 ++++++++++++++++++++++++++++++++++++++++ src/core/util.h | 1 + test/suite0002.janet | 4 ++++ test/suite0008.janet | 9 ++++++++- 6 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/core/peg.c b/src/core/peg.c index b7de920d..3d035533 100644 --- a/src/core/peg.c +++ b/src/core/peg.c @@ -1637,7 +1637,7 @@ typedef struct { JanetPeg *peg; PegState s; JanetByteView bytes; - JanetByteView repl; + Janet subst; int32_t start; } PegCall; @@ -1653,7 +1653,7 @@ static PegCall peg_cfun_init(int32_t argc, Janet *argv, int get_replace) { ret.peg = compile_peg(argv[0]); } if (get_replace) { - ret.repl = janet_getbytes(argv, 1); + ret.subst = argv[1]; ret.bytes = janet_getbytes(argv, 2); } else { ret.bytes = janet_getbytes(argv, 1); @@ -1738,7 +1738,8 @@ static Janet cfun_peg_replace_generic(int32_t argc, Janet *argv, int only_one) { trail = i; } int32_t nexti = (int32_t)(result - c.bytes.bytes); - janet_buffer_push_bytes(ret, c.repl.bytes, c.repl.len); + JanetByteView subst = janet_text_substitution(&c.subst, c.bytes.bytes + i, nexti - i); + janet_buffer_push_bytes(ret, subst.bytes, subst.len); trail = nexti; if (nexti == i) nexti++; i = nexti; @@ -1754,14 +1755,20 @@ static Janet cfun_peg_replace_generic(int32_t argc, Janet *argv, int only_one) { } JANET_CORE_FN(cfun_peg_replace_all, - "(peg/replace-all peg repl text &opt start & args)", - "Replace all matches of peg in text with repl, returning a new buffer. The peg does not need to make captures to do replacement.") { + "(peg/replace-all peg subst text &opt start & args)", + "Replace all matches of `peg` in `text` with `subst`, returning a new buffer. " + "The peg does not need to make captures to do replacement. " + "If `subst` is a function, it will be called once for each match " + "and should return the actual replacement text to use.") { return cfun_peg_replace_generic(argc, argv, 0); } JANET_CORE_FN(cfun_peg_replace, "(peg/replace peg repl text &opt start & args)", - "Replace first match of peg in text with repl, returning a new buffer. The peg does not need to make captures to do replacement. " + "Replace first match of `peg` in `text` with `subst`, returning a new buffer. " + "The peg does not need to make captures to do replacement. " + "If `subst` is a function, it will be called with the matching text, " + "and should return the actual replacement text to use. " "If no matches are found, returns the input string in a new buffer.") { return cfun_peg_replace_generic(argc, argv, 1); } diff --git a/src/core/string.c b/src/core/string.c index 1e1d9622..f898bb94 100644 --- a/src/core/string.c +++ b/src/core/string.c @@ -364,14 +364,13 @@ JANET_CORE_FN(cfun_string_findall, struct replace_state { struct kmp_state kmp; - const uint8_t *subst; - int32_t substlen; + Janet subst; }; static void replacesetup(int32_t argc, Janet *argv, struct replace_state *s) { janet_arity(argc, 3, 4); JanetByteView pat = janet_getbytes(argv, 0); - JanetByteView subst = janet_getbytes(argv, 1); + Janet subst = argv[1]; JanetByteView text = janet_getbytes(argv, 2); int32_t start = 0; if (argc == 4) { @@ -380,13 +379,14 @@ static void replacesetup(int32_t argc, Janet *argv, struct replace_state *s) { } kmp_init(&s->kmp, text.bytes, text.len, pat.bytes, pat.len); s->kmp.i = start; - s->subst = subst.bytes; - s->substlen = subst.len; + s->subst = subst; } JANET_CORE_FN(cfun_string_replace, "(string/replace patt subst str)", "Replace the first occurrence of `patt` with `subst` in the string `str`. " + "If `subst` is a function, it will be called with `patt` only if a match is found, " + "and should return the actual replacement text to use. " "Will return the new string if `patt` is found, otherwise returns `str`.") { int32_t result; struct replace_state s; @@ -397,10 +397,11 @@ JANET_CORE_FN(cfun_string_replace, kmp_deinit(&s.kmp); return janet_stringv(s.kmp.text, s.kmp.textlen); } - buf = janet_string_begin(s.kmp.textlen - s.kmp.patlen + s.substlen); + JanetByteView subst = janet_text_substitution(&s.subst, s.kmp.text + result, s.kmp.patlen); + buf = janet_string_begin(s.kmp.textlen - s.kmp.patlen + subst.len); safe_memcpy(buf, s.kmp.text, result); - safe_memcpy(buf + result, s.subst, s.substlen); - safe_memcpy(buf + result + s.substlen, + safe_memcpy(buf + result, subst.bytes, subst.len); + safe_memcpy(buf + result + subst.len, s.kmp.text + result + s.kmp.patlen, s.kmp.textlen - result - s.kmp.patlen); kmp_deinit(&s.kmp); @@ -411,6 +412,8 @@ JANET_CORE_FN(cfun_string_replaceall, "(string/replace-all patt subst str)", "Replace all instances of `patt` with `subst` in the string `str`. Overlapping " "matches will not be counted, only the first match in such a span will be replaced. " + "If `subst` is a function, it will be called with `patt` once for each match, " + "and should return the actual replacement text to use. " "Will return the new string if `patt` is found, otherwise returns `str`.") { int32_t result; struct replace_state s; @@ -419,8 +422,9 @@ JANET_CORE_FN(cfun_string_replaceall, replacesetup(argc, argv, &s); janet_buffer_init(&b, s.kmp.textlen); while ((result = kmp_next(&s.kmp)) >= 0) { + JanetByteView subst = janet_text_substitution(&s.subst, s.kmp.text + result, s.kmp.patlen); janet_buffer_push_bytes(&b, s.kmp.text + lastindex, result - lastindex); - janet_buffer_push_bytes(&b, s.subst, s.substlen); + janet_buffer_push_bytes(&b, subst.bytes, subst.len); lastindex = result + s.kmp.patlen; kmp_seti(&s.kmp, lastindex); } diff --git a/src/core/util.c b/src/core/util.c index a9395545..ddbb4515 100644 --- a/src/core/util.c +++ b/src/core/util.c @@ -663,6 +663,46 @@ JanetBinding janet_binding_from_entry(Janet entry) { return binding; } +/* If the value at the given address can be coerced to a byte view, + return that byte view. If it can't, replace the value at the address + with the result of janet_to_string, and return a byte view over that + string. */ +static JanetByteView memoize_byte_view(Janet *value) { + JanetByteView result; + if (!janet_bytes_view(*value, &result.bytes, &result.len)) { + JanetString str = janet_to_string(*value); + *value = janet_wrap_string(str); + result.bytes = str; + result.len = janet_string_length(str); + } + return result; +} + +static JanetByteView to_byte_view(Janet value) { + JanetByteView result; + if (!janet_bytes_view(value, &result.bytes, &result.len)) { + JanetString str = janet_to_string(value); + result.bytes = str; + result.len = janet_string_length(str); + } + return result; +} + +JanetByteView janet_text_substitution(Janet *subst, const uint8_t *bytes, uint32_t len) { + switch (janet_type(*subst)) { + case JANET_CFUNCTION: { + Janet matched = janet_stringv(bytes, len); + return to_byte_view(janet_unwrap_cfunction(*subst)(1, &matched)); + } + case JANET_FUNCTION: { + Janet matched = janet_stringv(bytes, len); + return to_byte_view(janet_call(janet_unwrap_function(*subst), 1, &matched)); + } + default: + return memoize_byte_view(subst); + } +} + JanetBinding janet_resolve_ext(JanetTable *env, const uint8_t *sym) { Janet entry = janet_table_get(env, janet_wrap_symbol(sym)); return janet_binding_from_entry(entry); diff --git a/src/core/util.h b/src/core/util.h index 5a4c8808..2eaf003a 100644 --- a/src/core/util.h +++ b/src/core/util.h @@ -93,6 +93,7 @@ void janet_buffer_format( Janet *argv); Janet janet_next_impl(Janet ds, Janet key, int is_interpreter); JanetBinding janet_binding_from_entry(Janet entry); +JanetByteView janet_text_substitution(Janet *subst, const uint8_t *bytes, uint32_t len); /* Registry functions */ void janet_registry_put( diff --git a/test/suite0002.janet b/test/suite0002.janet index f971df1a..bb249298 100644 --- a/test/suite0002.janet +++ b/test/suite0002.janet @@ -72,6 +72,10 @@ (assert (= (string/replace "X" "." "XXX...XXX...XXX") ".XX...XXX...XXX") "string/replace 1") (assert (= (string/replace-all "X" "." "XXX...XXX...XXX") "...............") "string/replace-all 1") (assert (= (string/replace-all "XX" "." "XXX...XXX...XXX") ".X....X....X") "string/replace-all 2") +(assert (= (string/replace "xx" string/ascii-upper "xxyxyxyxxxy") "XXyxyxyxxxy") "string/replace function") +(assert (= (string/replace-all "xx" string/ascii-upper "xxyxyxyxxxy") "XXyxyxyXXxy") "string/replace-all function") +(assert (= (string/replace "x" 12 "xyx") "12yx") "string/replace stringable") +(assert (= (string/replace-all "x" 12 "xyx") "12y12") "string/replace-all stringable") (assert (= (string/ascii-lower "ABCabc&^%!@:;.") "abcabc&^%!@:;.") "string/ascii-lower") (assert (= (string/ascii-upper "ABCabc&^%!@:;.") "ABCABC&^%!@:;.") "string/ascii-lower") (assert (= (string/reverse "") "") "string/reverse 1") diff --git a/test/suite0008.janet b/test/suite0008.janet index 8c65027c..457b69ae 100644 --- a/test/suite0008.janet +++ b/test/suite0008.janet @@ -330,7 +330,6 @@ neldb\0\0\0\xD8\x05printG\x01\0\xDE\xDE\xDE'\x03\0marshal_tes/\x02 (assert (deep= (peg/find-all '"/" p) @[0 4 10 14]) "peg find-all") # Peg replace and replace-all -(var ti 0) (defn check-replacer [x y z] (assert (= (string/replace x y z) (string (peg/replace x y z))) "replacer test replace") @@ -339,6 +338,14 @@ neldb\0\0\0\xD8\x05printG\x01\0\xDE\xDE\xDE'\x03\0marshal_tes/\x02 (check-replacer "abc" "Z" "") (check-replacer "aba" "ZZZZZZ" "ababababababa") (check-replacer "aba" "" "ababababababa") +(check-replacer "aba" string/ascii-upper "ababababababa") +(check-replacer "aba" 123 "ababababababa") +(assert (= (string (peg/replace-all ~(set "ab") string/ascii-upper "abcaa")) + "ABcAA") + "peg/replace-all cfunction") +(assert (= (string (peg/replace-all ~(set "ab") |$ "abcaa")) + "abcaa") + "peg/replace-all function") # Peg bug (assert (deep= @[] (peg/match '(any 1) @"")) "peg empty pattern 1") From 9dc7e8ed3afcc9aa885117f67db22790025f2752 Mon Sep 17 00:00:00 2001 From: Ian Henry Date: Sun, 23 Apr 2023 09:09:14 -0700 Subject: [PATCH 2/2] peg replacement functions have access to captures When peg/replace or peg/replace-all are given a function to serve as the text replacement, any captures produced by the PEG are passed as additional arguments to that function. --- src/core/peg.c | 10 +++++----- src/core/string.c | 4 ++-- src/core/util.c | 39 ++++++++++++++++++++++++++------------- src/core/util.h | 6 +++++- test/suite0008.janet | 18 ++++++++++++++++++ 5 files changed, 56 insertions(+), 21 deletions(-) diff --git a/src/core/peg.c b/src/core/peg.c index 3d035533..5057494a 100644 --- a/src/core/peg.c +++ b/src/core/peg.c @@ -1738,7 +1738,7 @@ static Janet cfun_peg_replace_generic(int32_t argc, Janet *argv, int only_one) { trail = i; } int32_t nexti = (int32_t)(result - c.bytes.bytes); - JanetByteView subst = janet_text_substitution(&c.subst, c.bytes.bytes + i, nexti - i); + JanetByteView subst = janet_text_substitution(&c.subst, c.bytes.bytes + i, nexti - i, c.s.captures); janet_buffer_push_bytes(ret, subst.bytes, subst.len); trail = nexti; if (nexti == i) nexti++; @@ -1758,8 +1758,8 @@ JANET_CORE_FN(cfun_peg_replace_all, "(peg/replace-all peg subst text &opt start & args)", "Replace all matches of `peg` in `text` with `subst`, returning a new buffer. " "The peg does not need to make captures to do replacement. " - "If `subst` is a function, it will be called once for each match " - "and should return the actual replacement text to use.") { + "If `subst` is a function, it will be called with the " + "matching text followed by any captures.") { return cfun_peg_replace_generic(argc, argv, 0); } @@ -1767,8 +1767,8 @@ JANET_CORE_FN(cfun_peg_replace, "(peg/replace peg repl text &opt start & args)", "Replace first match of `peg` in `text` with `subst`, returning a new buffer. " "The peg does not need to make captures to do replacement. " - "If `subst` is a function, it will be called with the matching text, " - "and should return the actual replacement text to use. " + "If `subst` is a function, it will be called with the " + "matching text followed by any captures. " "If no matches are found, returns the input string in a new buffer.") { return cfun_peg_replace_generic(argc, argv, 1); } diff --git a/src/core/string.c b/src/core/string.c index f898bb94..e7957edf 100644 --- a/src/core/string.c +++ b/src/core/string.c @@ -397,7 +397,7 @@ JANET_CORE_FN(cfun_string_replace, kmp_deinit(&s.kmp); return janet_stringv(s.kmp.text, s.kmp.textlen); } - JanetByteView subst = janet_text_substitution(&s.subst, s.kmp.text + result, s.kmp.patlen); + JanetByteView subst = janet_text_substitution(&s.subst, s.kmp.text + result, s.kmp.patlen, NULL); buf = janet_string_begin(s.kmp.textlen - s.kmp.patlen + subst.len); safe_memcpy(buf, s.kmp.text, result); safe_memcpy(buf + result, subst.bytes, subst.len); @@ -422,7 +422,7 @@ JANET_CORE_FN(cfun_string_replaceall, replacesetup(argc, argv, &s); janet_buffer_init(&b, s.kmp.textlen); while ((result = kmp_next(&s.kmp)) >= 0) { - JanetByteView subst = janet_text_substitution(&s.subst, s.kmp.text + result, s.kmp.patlen); + JanetByteView subst = janet_text_substitution(&s.subst, s.kmp.text + result, s.kmp.patlen, NULL); janet_buffer_push_bytes(&b, s.kmp.text + lastindex, result - lastindex); janet_buffer_push_bytes(&b, subst.bytes, subst.len); lastindex = result + s.kmp.patlen; diff --git a/src/core/util.c b/src/core/util.c index ddbb4515..3c50bc94 100644 --- a/src/core/util.c +++ b/src/core/util.c @@ -688,19 +688,32 @@ static JanetByteView to_byte_view(Janet value) { return result; } -JanetByteView janet_text_substitution(Janet *subst, const uint8_t *bytes, uint32_t len) { - switch (janet_type(*subst)) { - case JANET_CFUNCTION: { - Janet matched = janet_stringv(bytes, len); - return to_byte_view(janet_unwrap_cfunction(*subst)(1, &matched)); - } - case JANET_FUNCTION: { - Janet matched = janet_stringv(bytes, len); - return to_byte_view(janet_call(janet_unwrap_function(*subst), 1, &matched)); - } - default: - return memoize_byte_view(subst); - } +JanetByteView janet_text_substitution( + Janet *subst, + const uint8_t *bytes, + uint32_t len, + JanetArray *extra_argv) { + int32_t extra_argc = extra_argv == NULL ? 0 : extra_argv->count; + JanetType type = janet_type(*subst); + switch (type) { + case JANET_FUNCTION: + case JANET_CFUNCTION: { + int32_t argc = 1 + extra_argc; + Janet *argv = janet_tuple_begin(argc); + argv[0] = janet_stringv(bytes, len); + for (int32_t i = 0; i < extra_argc; i++) { + argv[i + 1] = extra_argv->data[i]; + } + janet_tuple_end(argv); + if (type == JANET_FUNCTION) { + return to_byte_view(janet_call(janet_unwrap_function(*subst), argc, argv)); + } else { + return to_byte_view(janet_unwrap_cfunction(*subst)(argc, argv)); + } + } + default: + return memoize_byte_view(subst); + } } JanetBinding janet_resolve_ext(JanetTable *env, const uint8_t *sym) { diff --git a/src/core/util.h b/src/core/util.h index 2eaf003a..b8f9cc90 100644 --- a/src/core/util.h +++ b/src/core/util.h @@ -93,7 +93,11 @@ void janet_buffer_format( Janet *argv); Janet janet_next_impl(Janet ds, Janet key, int is_interpreter); JanetBinding janet_binding_from_entry(Janet entry); -JanetByteView janet_text_substitution(Janet *subst, const uint8_t *bytes, uint32_t len); +JanetByteView janet_text_substitution( + Janet *subst, + const uint8_t *bytes, + uint32_t len, + JanetArray *extra_args); /* Registry functions */ void janet_registry_put( diff --git a/test/suite0008.janet b/test/suite0008.janet index 457b69ae..1bec7190 100644 --- a/test/suite0008.janet +++ b/test/suite0008.janet @@ -340,6 +340,7 @@ neldb\0\0\0\xD8\x05printG\x01\0\xDE\xDE\xDE'\x03\0marshal_tes/\x02 (check-replacer "aba" "" "ababababababa") (check-replacer "aba" string/ascii-upper "ababababababa") (check-replacer "aba" 123 "ababababababa") + (assert (= (string (peg/replace-all ~(set "ab") string/ascii-upper "abcaa")) "ABcAA") "peg/replace-all cfunction") @@ -347,6 +348,23 @@ neldb\0\0\0\xD8\x05printG\x01\0\xDE\xDE\xDE'\x03\0marshal_tes/\x02 "abcaa") "peg/replace-all function") +(defn peg-test [name f peg subst text expected] + (assert (= (string (f peg subst text)) expected) name)) + +(peg-test "peg/replace has access to captures" + peg/replace + ~(sequence "." (capture (set "ab"))) + (fn [str char] (string/format "%s -> %s, " str (string/ascii-upper char))) + ".a.b.c" + ".a -> A, .b.c") + +(peg-test "peg/replace-all has access to captures" + peg/replace-all + ~(sequence "." (capture (set "ab"))) + (fn [str char] (string/format "%s -> %s, " str (string/ascii-upper char))) + ".a.b.c" + ".a -> A, .b -> B, .c") + # Peg bug (assert (deep= @[] (peg/match '(any 1) @"")) "peg empty pattern 1") (assert (deep= @[] (peg/match '(any 1) (buffer))) "peg empty pattern 2")