From 50a8202881d2e207ec14dc936c1d91677573cd03 Mon Sep 17 00:00:00 2001 From: Joshua Nelson Date: Wed, 22 Dec 2021 15:23:50 -0600 Subject: [PATCH 1/3] Add `lol_ok` function to picotest This makes the error checking less verbose, and also prints a much better error if a check fails. --- c-api/tests/src/deps/picotest/picotest.c | 11 +++++++++++ c-api/tests/src/deps/picotest/picotest.h | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/c-api/tests/src/deps/picotest/picotest.c b/c-api/tests/src/deps/picotest/picotest.c index ddf59de8..4822b421 100644 --- a/c-api/tests/src/deps/picotest/picotest.c +++ b/c-api/tests/src/deps/picotest/picotest.c @@ -26,6 +26,7 @@ #include #include #include "picotest.h" +#include "../../../../include/lol_html.h" struct test_t { int num_tests; @@ -75,6 +76,16 @@ void _ok(int cond, const char *fmt, ...) fflush(stdout); } +void _lol_ok(int cond, const char *file, int line) { + cond = !cond; // lol-html returns 0 on success + _ok(cond, "%s %d", file, line); + if (!cond) { + lol_html_str_t err = lol_html_take_last_error(); + assert(err.data != NULL && err.len != 0); + printf("err: last lol_html err: %s", err.data); + } +} + int done_testing(void) { indent(); diff --git a/c-api/tests/src/deps/picotest/picotest.h b/c-api/tests/src/deps/picotest/picotest.h index 63b4d583..3339f48b 100644 --- a/c-api/tests/src/deps/picotest/picotest.h +++ b/c-api/tests/src/deps/picotest/picotest.h @@ -22,13 +22,17 @@ #ifndef picotest_h #define picotest_h +#include + #ifdef __cplusplus extern "C" { #endif void note(const char *fmt, ...) __attribute__((format (printf, 1, 2))); void _ok(int cond, const char *fmt, ...) __attribute__((format (printf, 2, 3))); +void _lol_ok(int cond, const char *file, int line); #define ok(cond) _ok(cond, "%s %d", __FILE__, __LINE__) +#define lol_ok(cond) _lol_ok(cond, __FILE__, __LINE__) int done_testing(void); void subtest(const char *name, void (*cb)(void)); From e9e50850191efaa9b6af42b77fbbdd80ff2d5b9a Mon Sep 17 00:00:00 2001 From: Joshua Nelson Date: Wed, 22 Dec 2021 15:26:04 -0600 Subject: [PATCH 2/3] Add a C API for `on_end_tag` This does several things: - Expose `Element::on_end_tag` - Expose all the methods of `EndTag` (`name`, `set_name`, `before`, `after`, `remove`) - Add a new `EndTag::set_name_str` function to make sure it uses the proper encoding. Using `set_name` would require blindly assuming that the rewriter always used UTF8. - Add a test to ensure all the new API works correctly --- c-api/include/lol_html.h | 69 ++++++++++++++++++++++++++ c-api/src/element.rs | 65 ++++++++++++++++++++++++ c-api/tests/src/test_element_api.c | 68 +++++++++++++++++++++++++ src/rewritable_units/tokens/end_tag.rs | 6 +++ 4 files changed, 208 insertions(+) diff --git a/c-api/include/lol_html.h b/c-api/include/lol_html.h index 6b0894f5..237b1b92 100644 --- a/c-api/include/lol_html.h +++ b/c-api/include/lol_html.h @@ -23,6 +23,7 @@ typedef struct lol_html_HtmlRewriterBuilder lol_html_rewriter_builder_t; typedef struct lol_html_HtmlRewriter lol_html_rewriter_t; typedef struct lol_html_Doctype lol_html_doctype_t; typedef struct lol_html_DocumentEnd lol_html_doc_end_t; +typedef struct lol_html_EndTag lol_html_end_tag_t; typedef struct lol_html_Comment lol_html_comment_t; typedef struct lol_html_TextChunk lol_html_text_chunk_t; typedef struct lol_html_Element lol_html_element_t; @@ -110,6 +111,11 @@ typedef lol_html_rewriter_directive_t (*lol_html_doc_end_handler_t)( void *user_data ); +typedef lol_html_rewriter_directive_t (*lol_html_end_tag_handler_t)( + lol_html_end_tag_t *end_tag, + void *user_data +); + // Selector //--------------------------------------------------------------------- @@ -671,6 +677,69 @@ void lol_html_element_user_data_set( // Returns user data attached to the text chunk. void *lol_html_element_user_data_get(const lol_html_element_t *element); +// Adds content handlers to the builder for the end tag of the given element. +// +// Subsequent calls to the method on the same element replace the previous handler. +// +// The handler can optionally have associated user data which will be +// passed to the handler on each invocation along with the rewritable +// unit argument. +// +// If the handler returns LOL_HTML_STOP directive then rewriting +// stops immediately and `write()` or `end()` of the rewriter methods +// return an error code. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// WARNING: Pointers passed to handlers are valid only during the +// handler execution. So they should never be leaked outside of handlers. +int lol_html_element_on_end_tag(lol_html_element_t* element, lol_html_end_tag_handler_t end_tag_handler, void* user_data); + +// Inserts the content string before the element's end tag either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_end_tag_before( + lol_html_end_tag_t *end_tag, + const char *content, + size_t content_len, + bool is_html +); + +// Inserts the content string right after the element's end tag as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_end_tag_after( + lol_html_end_tag_t *end_tag, + const char *content, + size_t content_len, + bool is_html +); + +// Removes the end tag. +void lol_html_end_tag_remove(lol_html_end_tag_t *end_tag); + +// Returns the end tag name. +lol_html_str_t lol_html_end_tag_name_get(const lol_html_end_tag_t *end_tag); + +// Sets the tag name of the end tag. +// +// Name should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_end_tag_name_set( + lol_html_end_tag_t *end_tag, + const char *name, + size_t name_len +); + // Inserts the content at the end of the document, either as raw text or as HTML. // // The content should be a valid UTF-8 string. diff --git a/c-api/src/element.rs b/c-api/src/element.rs index 178f35e8..57369172 100644 --- a/c-api/src/element.rs +++ b/c-api/src/element.rs @@ -217,3 +217,68 @@ pub extern "C" fn lol_html_element_user_data_set(element: *mut Element, user_dat pub extern "C" fn lol_html_element_user_data_get(element: *mut Element) -> *mut c_void { get_user_data!(element) } + +type EndTagHandler = unsafe extern "C" fn(*mut EndTag, *mut c_void) -> RewriterDirective; + +#[no_mangle] +pub extern "C" fn lol_html_element_on_end_tag( + element: *mut Element, + handler: EndTagHandler, + user_data: *mut c_void, +) -> c_int { + let element = to_ref_mut!(element); + // TODO: should this pass `user_data`? The problem is it will be using an old version of the + // user data set before the rewriter started running ... maybe we should just let the callback + // call `user_data_get` if it wants to associate data with this? + // let user_data = element.user_data().downcase_ref::<*mut c_void>().unwrap_or(ptr::null_mut()); + let () = unwrap_or_ret_err_code!(element.on_end_tag(move |end_tag| { + match unsafe { handler(end_tag, user_data) } { + RewriterDirective::Continue => Ok(()), + RewriterDirective::Stop => Err("The rewriter has been stopped.".into()), + } + })); + 0 +} + +#[no_mangle] +pub extern "C" fn lol_html_end_tag_before( + end_tag: *mut EndTag, + content: *const c_char, + content_len: size_t, + is_html: bool, +) -> c_int { + content_insertion_fn_body! { end_tag.before(content, content_len, is_html) } +} + +#[no_mangle] +pub extern "C" fn lol_html_end_tag_after( + end_tag: *mut EndTag, + content: *const c_char, + content_len: size_t, + is_html: bool, +) -> c_int { + content_insertion_fn_body! { end_tag.after(content, content_len, is_html) } +} + +#[no_mangle] +pub extern "C" fn lol_html_end_tag_remove(end_tag: *mut EndTag) { + to_ref_mut!(end_tag).remove(); +} + +#[no_mangle] +pub extern "C" fn lol_html_end_tag_name_get(end_tag: *mut EndTag) -> Str { + let tag = to_ref_mut!(end_tag); + Str::new(tag.name()) +} + +#[no_mangle] +pub extern "C" fn lol_html_end_tag_name_set( + end_tag: *mut EndTag, + name: *const c_char, + len: size_t, +) -> c_int { + let tag = to_ref_mut!(end_tag); + let name = unwrap_or_ret_err_code! { to_str!(name, len) }; + tag.set_name_str(name); + 0 +} diff --git a/c-api/tests/src/test_element_api.c b/c-api/tests/src/test_element_api.c index 739dd6bb..e6e34a75 100644 --- a/c-api/tests/src/test_element_api.c +++ b/c-api/tests/src/test_element_api.c @@ -617,6 +617,48 @@ static void test_stop(lol_html_selector_t *selector, void *user_data) { ok(!err); } +static lol_html_rewriter_directive_t modify_element_end_tag_name_inner(lol_html_end_tag_t *end_tag, void *user_data) { + int times_run = *(int*)user_data; + + if (times_run == 0) { + lol_ok(lol_html_end_tag_before(end_tag, "!", 1, false)); + const char *after_html = "extra data"; + lol_ok(lol_html_end_tag_after(end_tag, after_html, strlen(after_html), true)); + + lol_html_str_t name = lol_html_end_tag_name_get(end_tag); + str_eq(name, "div"); + + lol_ok(lol_html_end_tag_name_set(end_tag, "div1", strlen("div1"))); + name = lol_html_end_tag_name_get(end_tag); + str_eq(name, "div1"); + } else { + lol_html_end_tag_remove(end_tag); + } + + return LOL_HTML_CONTINUE; +} + +static lol_html_rewriter_directive_t modify_element_end_tag_name_outer( + lol_html_element_t *element, + void *user_data +) { + UNUSED(user_data); + + static int times_run = -1; // so that it will be 0 on the first call to `inner` + + lol_ok(lol_html_element_on_end_tag(element, modify_element_end_tag_name_inner, ×_run)); + times_run += 1; + + return LOL_HTML_CONTINUE; +} + +EXPECT_OUTPUT( + modify_element_end_tag, + "
42!extra data
some data", + &EXPECTED_USER_DATA, + sizeof(EXPECTED_USER_DATA) +); + void element_api_test() { int user_data = 43; @@ -718,4 +760,30 @@ void element_api_test() { lol_html_selector_free(selector); } + + { + note("EndTagChange"); + + const char *selector_str = "div"; + lol_html_selector_t *selector = lol_html_selector_parse( + selector_str, + strlen(selector_str) + ); + + lol_html_rewriter_builder_t *builder = lol_html_rewriter_builder_new(); + + lol_ok(lol_html_rewriter_builder_add_element_content_handlers( + builder, + selector, + modify_element_end_tag_name_outer, + NULL, + NULL, + NULL, + NULL, + NULL + )); + + const char *input = "
42
some data
"; + run_rewriter(builder, input, modify_element_end_tag, &user_data); + } } diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index c2ac2248..94f5ad69 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -36,6 +36,12 @@ impl<'i> EndTag<'i> { self.raw = None; } + #[inline] + // FIXME(encoding_rs#77): take an owned value for `name` to avoid an extra copy. + pub fn set_name_str(&mut self, name: &str) { + self.set_name(Bytes::from_str(name, self.encoding).into_owned()) + } + #[inline] pub fn before(&mut self, content: &str, content_type: ContentType) { self.mutations.before(content, content_type); From cda0d59205a1e36ae7ea1caecca92d1f1e5f3c4c Mon Sep 17 00:00:00 2001 From: Joshua Nelson Date: Fri, 24 Dec 2021 09:58:29 -0600 Subject: [PATCH 3/3] Take an owned value in `EndTag::set_name_str` This avoids an unnecessary copy if you already have an owned String. --- c-api/src/element.rs | 6 +----- src/base/bytes.rs | 9 +++++++++ src/rewritable_units/tokens/end_tag.rs | 5 ++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/c-api/src/element.rs b/c-api/src/element.rs index 57369172..c0bc0936 100644 --- a/c-api/src/element.rs +++ b/c-api/src/element.rs @@ -227,10 +227,6 @@ pub extern "C" fn lol_html_element_on_end_tag( user_data: *mut c_void, ) -> c_int { let element = to_ref_mut!(element); - // TODO: should this pass `user_data`? The problem is it will be using an old version of the - // user data set before the rewriter started running ... maybe we should just let the callback - // call `user_data_get` if it wants to associate data with this? - // let user_data = element.user_data().downcase_ref::<*mut c_void>().unwrap_or(ptr::null_mut()); let () = unwrap_or_ret_err_code!(element.on_end_tag(move |end_tag| { match unsafe { handler(end_tag, user_data) } { RewriterDirective::Continue => Ok(()), @@ -279,6 +275,6 @@ pub extern "C" fn lol_html_end_tag_name_set( ) -> c_int { let tag = to_ref_mut!(end_tag); let name = unwrap_or_ret_err_code! { to_str!(name, len) }; - tag.set_name_str(name); + tag.set_name_str(name.to_string()); 0 } diff --git a/src/base/bytes.rs b/src/base/bytes.rs index 5f232654..828811f2 100644 --- a/src/base/bytes.rs +++ b/src/base/bytes.rs @@ -20,6 +20,15 @@ impl<'b> Bytes<'b> { encoding.encode(string).0.into() } + /// Same as `Bytes::from_str(&string).into_owned()`, but avoids copying in the common case where + /// the output and input encodings are the same. + pub fn from_string(string: String, encoding: &'static Encoding) -> Bytes<'static> { + Bytes(Cow::Owned(match encoding.encode(&string).0 { + Cow::Owned(bytes) => bytes, + Cow::Borrowed(_) => string.into_bytes(), + })) + } + #[inline] pub fn from_str_without_replacements( string: &'b str, diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index 94f5ad69..54577de1 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -37,9 +37,8 @@ impl<'i> EndTag<'i> { } #[inline] - // FIXME(encoding_rs#77): take an owned value for `name` to avoid an extra copy. - pub fn set_name_str(&mut self, name: &str) { - self.set_name(Bytes::from_str(name, self.encoding).into_owned()) + pub fn set_name_str(&mut self, name: String) { + self.set_name(Bytes::from_string(name, self.encoding)) } #[inline]