diff --git a/src/deep_search/commonutils.cpp b/src/deep_search/commonutils.cpp index e8e1e4b590..baaa16def3 100644 --- a/src/deep_search/commonutils.cpp +++ b/src/deep_search/commonutils.cpp @@ -26,6 +26,63 @@ #include "util/rsthreads.h" #include "util/rsdebuglevel0.h" +namespace DeepSearch +{ + +std::string simpleTextHtmlExtract(const std::string& rsHtmlDoc) +{ + if(rsHtmlDoc.empty()) return rsHtmlDoc; + + const bool isPlainMsg = + rsHtmlDoc[0] != '<' || rsHtmlDoc[rsHtmlDoc.size() - 1] != '>'; + if(isPlainMsg) return rsHtmlDoc; + + auto oSize = rsHtmlDoc.size(); + auto bodyTagBegin(rsHtmlDoc.find("
= oSize) return rsHtmlDoc; + + auto bodyTagEnd(rsHtmlDoc.find(">", bodyTagBegin)); + if(bodyTagEnd >= oSize) return rsHtmlDoc; + + std::string retVal(rsHtmlDoc.substr(bodyTagEnd+1)); + + // strip also CSS inside + oSize = retVal.size(); + auto styleTagBegin(retVal.find(" - oSize = retVal.size(); - auto styleTagBegin(retVal.find("