/**
 * Removes known tracking parameters (utm_*, fbclid, gclid, ...) from a URL.
 *
 * Only the query string is touched: the rest of the URL (scheme, credentials,
 * host, port, path, fragment) and every parameter that is kept are preserved
 * byte-for-byte. If the URL cannot be parsed, has no query string, or carries
 * no tracking parameter, it is returned unchanged.
 *
 * @param string $url URL to clean.
 * @return string The URL without tracking parameters.
 */
function sanitize_url($url) {
    // A rule ending in '*' matches every parameter starting with that prefix;
    // any other rule must equal the parameter name exactly.
    $strip = [
        // NOTE(review): 'fb_' only matches a parameter literally named "fb_";
        // if a prefix match was intended it should be written 'fb_*' — confirm.
        'fbclid', 'fb_', // Facebook
        'ga_*', 'gclid', 'gclsrc', 'gs_l', // google ads
        'ref',
        'mc_eid',
        'igshid',
        'twclid',
        'msclkid',
        'trk', 'trkCampaign', // amazon
        'utm_*', 'nr_email_referer', // utm
        'itm_*', // itm
        'mc_*',
        'yclid', '_openstat', // yandex
        'sc_campaign', 'sc_channel', 'sc_content', 'sc_medium', 'sc_outcome', 'sc_geo', 'sc_country', // Campaign tracking (sc)
    ];

    $parsed = parse_url($url);
    if ($parsed === false || empty($parsed['query'])) {
        return $url;
    }

    // Work on the raw "name=value" pairs instead of a parse_str() /
    // http_build_query() round-trip, which would rewrite the parameters we
    // keep ("a.b" -> "a_b", "%20" -> "+", "flag" -> "flag=", ...).
    $kept = [];
    $removed = false;
    foreach (explode('&', $parsed['query']) as $pair) {
        if ($pair === '') {
            continue;
        }
        // Normalise the name the way parse_str() would, for matching only:
        // drop array brackets and turn '.' and ' ' into '_'.
        $name = urldecode(explode('=', $pair, 2)[0]);
        $bracket = strpos($name, '[');
        if ($bracket !== false) {
            $name = substr($name, 0, $bracket);
        }
        $name = strtr($name, '. ', '__');

        $is_tracking = false;
        foreach ($strip as $rule) {
            if (str_ends_with($rule, '*')) {
                if (str_starts_with($name, substr($rule, 0, -1))) {
                    $is_tracking = true;
                    break;
                }
            } elseif ($name === $rule) {
                $is_tracking = true;
                break;
            }
        }

        if ($is_tracking) {
            $removed = true;
        } else {
            $kept[] = $pair;
        }
    }

    if (!$removed) {
        return $url;
    }

    // Splice the filtered query back into the original string rather than
    // re-assembling the URL from its parts: that keeps "user:pass@", a leading
    // "//" and schemes without an authority ("mailto:") intact.
    $hash_pos = strpos($url, '#');
    $fragment = $hash_pos === false ? '' : substr($url, $hash_pos);
    $head = $hash_pos === false ? $url : substr($url, 0, $hash_pos);
    $query_pos = strpos($head, '?');
    if ($query_pos === false) {
        // Should not happen once parse_url() reported a query; stay safe.
        return $url;
    }

    $query = empty($kept) ? '' : '?' . implode('&', $kept);
    return substr($head, 0, $query_pos) . $query . $fragment;
}
/**
 * Converts an HTML document or fragment into readable text.
 *
 * The conversion runs in two phases: a DOM phase (drops head/style/script/
 * noscript and comments, replaces images with their ALT text and links with
 * "text [n]" markers) followed by a regex phase (turns block-level tags into
 * line breaks, renders blockquotes as "> " quoted lines wrapped at 70 columns,
 * strips the remaining tags and decodes entities).
 *
 * The collected link targets are appended at the bottom as
 * `[n] <a href="url">url</a>` lines, so the result is plain text except for
 * that reference list, which is HTML-escaped anchor markup.
 *
 * @param string $html HTML markup, expected to be UTF-8.
 * @return string The text rendition; an empty string for empty input.
 */
function html_to_text($html) {
    // this function uses both the DOM PHP-lib and plain regexes.
    // Invisible characters are dropped, exotic spaces become a plain space.
    $fake_spaces = [
        "\u{00AD}", "\u{200B}", "\u{200C}", "\u{200D}",
        "\u{200E}", "\u{200F}", "\u{FEFF}", "\u{2060}", "\u{034F}"
    ];
    $weird_spaces = [
        "\u{2002}", "\u{2003}", "\u{2004}", "\u{2005}",
        "\u{2006}", "\u{2007}", "\u{2008}", "\u{2009}",
        "\u{200A}", "\u{00A0}", "\u{202F}", "\u{205F}",
        "\u{3000}"
    ];
    $normalize_spaces = static function ($text) use ($fake_spaces, $weird_spaces) {
        $text = str_replace($fake_spaces, '', $text);
        return str_replace($weird_spaces, ' ', $text);
    };
    $html = $normalize_spaces((string) $html);

    // DOMDocument::loadHTML() throws a ValueError on an empty string.
    if (trim($html) === '') {
        return '';
    }

    // load HTML for DOM. Non-ASCII characters are passed as numeric entities
    // so libxml does not misread the UTF-8 input as ISO-8859-1.
    $dom = new DOMDocument();
    $previous_errors = libxml_use_internal_errors(true); // real-world markup is rarely valid
    $dom->loadHTML(
        mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'),
        LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
    );
    libxml_clear_errors();
    libxml_use_internal_errors($previous_errors);

    // remove some useless tags (iterate over a snapshot: the node list is live)
    foreach (['head', 'style', 'script', 'noscript'] as $tag) {
        foreach (iterator_to_array($dom->getElementsByTagName($tag), false) as $node) {
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    // Remove HTML comments
    $xpath = new DOMXPath($dom);
    foreach ($xpath->query('//comment()') as $comment) {
        $comment->parentNode->removeChild($comment);
    }

    // Replaces images with their ALT
    foreach (iterator_to_array($dom->getElementsByTagName('img'), false) as $img) {
        $alt = trim($img->getAttribute('alt'));
        $replacement = $dom->createTextNode($alt !== '' ? " [Image: " . $alt . "] " : " ");
        $img->parentNode->replaceChild($replacement, $img);
    }

    // Grep the links and replace them with [indexes], and refer to them at the bottom of e-mail
    $links = []; // url => reference number, in order of first appearance
    foreach (iterator_to_array($dom->getElementsByTagName('a'), false) as $node) {
        if ($node->parentNode === null) {
            continue;
        }
        $url = trim($node->getAttribute('href'));
        $text = trim($node->textContent);
        if ($text === '') {
            $node->parentNode->removeChild($node);
            continue;
        }
        // Browsers ignore whitespace and control characters inside the scheme,
        // so " java\tscript:" must be recognised as javascript: too.
        $scheme_probe = strtolower((string) preg_replace('/[\x00-\x20]+/', '', $url));
        // filter out internal links and javascript
        if ($url === '' || str_starts_with($url, '#') || str_starts_with($scheme_probe, 'javascript:')) {
            $newNode = $dom->createTextNode($text);
        } else {
            if (!isset($links[$url])) {
                $links[$url] = count($links) + 1;
            }
            $newNode = $dom->createTextNode($text . ' [' . $links[$url] . ']');
        }
        $node->parentNode->replaceChild($newNode, $node);
    }
    $html = $dom->saveHTML();

    // The DOM part is done.
    // From now on we use Regex
    // Replace existing blockquotes (save them to not flatten them, then replace them back at the end)
    $html = preg_replace_callback('/<blockquote\b[^>]*>(.*?)<\/blockquote>/si', function ($m) use ($normalize_spaces) {
        // Convert internal breaks to a unique placeholder we won't lose
        $inner = preg_replace('/<(br|p|div)\b[^>]*>/i', " [[LINE_BREAK]]", $m[1]);
        $inner = strip_tags($inner);
        // Decode so that wrapping counts characters, not entity names
        $inner = $normalize_spaces(html_entity_decode($inner, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        $final_lines = [];
        foreach (explode("[[LINE_BREAK]]", $inner) as $line) {
            // Source newlines are plain whitespace; only the HTML breaks above start a new line
            $line = trim(preg_replace('/\s+/', ' ', $line));
            if ($line === '') continue;
            // Word wrap to 70 chars to allow space for "> "
            foreach (explode("\n", wordwrap($line, 70, "\n")) as $w_line) {
                // Re-escape: the text still goes through strip_tags() and the
                // final entity decoding below, which would otherwise eat a
                // literal "<...>" and decode entities twice.
                $final_lines[] = "> " . htmlspecialchars(trim($w_line), ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8');
            }
        }
        // We use a unique placeholder for the blockquote newlines too
        // to protect them from the tag-stripper step below
        return "[[BLOCK_START]]" . implode("[[BLOCK_LINE]]", $final_lines) . "[[BLOCK_END]]";
    }, $html);

    // The textual \n to keep are only those provided with HTML (like P, TABLE, LI, BR…).
    // Any other newline is ordinary whitespace: turn it into a space, never into nothing,
    // otherwise the words around it get glued together.
    $html = preg_replace('/\r\n|\r|\n/', ' ', $html);
    // collapse redundant spaces
    $html = preg_replace('/\s{2,}/', ' ', $html);

    // Safety net: images were already replaced in the DOM phase
    $html = preg_replace('/<img\b[^>]*?\balt=["\']([^"\']*)["\'][^>]*>/i', ' $1 ', $html);
    $html = preg_replace('/<img\b[^>]*>/i', '', $html);

    // HTML Block replacement. Replace some with \n\n and some with \n, spaces or nothing
    $html = preg_replace('#</?(?:p|div|ul|ol|dl|h[1-6]|table|article|section)\b[^>]*>#i', "\n\n", $html);
    $html = preg_replace('#</?(?:br|dt|dd|tr)\b[^>]*>#i', "\n", $html);
    foreach (['td', 'th'] as $tag) {
        $html = preg_replace('#<' . $tag . '\b[^>]*>(.*?)</' . $tag . '>#is', " $1 ", $html);
    }
    // List items get a dash bullet (dt/dd were already turned into plain line breaks above)
    $html = preg_replace('#<li\b[^>]*>#i', "\n— ", $html);
    $html = preg_replace('#</li>#i', "", $html);
    // special case for HR
    $html = preg_replace('#<hr\b[^>]*>#i', "\n\n" . str_repeat('-', 40) . "\n\n", $html);

    // Final cleanup
    $html = strip_tags($html);
    // Restore Blockquotes
    $html = str_replace(
        ["[[BLOCK_START]]", "[[BLOCK_LINE]]", "[[BLOCK_END]]"],
        ["\n\n", "\n", "\n\n"],
        $html
    );
    // Clean residual &entities, then normalise again: entities such as &nbsp;
    // only become characters at this point.
    $html = $normalize_spaces(html_entity_decode($html, ENT_QUOTES | ENT_HTML5, 'UTF-8'));

    // Handles newlines
    $lines = array_map(
        static fn ($line) => preg_replace('#\s{2,}#', ' ', trim($line)),
        explode("\n", $html)
    );
    $text = trim(preg_replace('/\n{3,}/', "\n\n", implode("\n", $lines)));

    // Append link references (insertion order is already [1], [2], [3]...)
    if (!empty($links)) {
        $text .= "\n\n";
        foreach ($links as $url => $index) {
            // Numeric-looking URLs come back as integer array keys
            $safeUrl = htmlspecialchars((string) $url, ENT_QUOTES, 'UTF-8');
            $text .= '[' . $index . '] <a href="' . $safeUrl . '">' . $safeUrl . '</a>' . "\n";
        }
    }
    return trim($text);
}