The code snippet included in this document replaces the high-value (extended) ASCII and Unicode characters that can cause problems in Web pages. These characters typically appear when a person copies and pastes text from Microsoft Word or another word-processing application.
The code sample is intended to be a CFC function, but can be adapted as necessary:
<!---
	DeMoronize: replaces Windows-1252 ("Microsoft Latin-1 extension") characters and
	their Unicode equivalents with plain-ASCII or HTML-entity substitutes.

	@inputString  The text to clean. Leading/trailing whitespace is trimmed first.
	@return       The cleaned string: typographic punctuation is replaced by ASCII
	              look-alikes or named entities, any other character in 128-159 is
	              removed, and characters 160-255 are emitted as named or numeric
	              HTML entities.
--->
<cffunction name="DeMoronize" access="public" output="no" returntype="string" hint="Fixes text using Microsoft Latin-1
Extensions, namely ASCII characters 128-160 and UniCode characters">
	<cfargument name="inputString" type="string" required="yes" hint="String to DeMoronize">
	<cfscript>
		var i = 0;
		var rt = trim(arguments.inputString);

		// Map incompatible non-ISO characters into plausible substitutes.
		// Each character is handled twice: once by its Windows-1252 byte
		// value (128-159) and once by its real Unicode code point.
		rt = Replace(rt, Chr(128), "&euro;", "All");
		rt = Replace(rt, Chr(8364), "&euro;", "All");
		rt = Replace(rt, Chr(130), ",", "All");
		rt = Replace(rt, Chr(8218), ",", "All");
		rt = Replace(rt, Chr(131), "<em>f</em>", "All");
		rt = Replace(rt, Chr(402), "<em>f</em>", "All");
		rt = Replace(rt, Chr(132), ",,", "All");
		rt = Replace(rt, Chr(8222), ",,", "All");
		rt = Replace(rt, Chr(133), "...", "All");
		rt = Replace(rt, Chr(8230), "...", "All");
		rt = Replace(rt, Chr(136), "^", "All");
		rt = Replace(rt, Chr(710), "^", "All");
		// Left single angle quote: use an OPENING bracket so it pairs with
		// the closing bracket used for the right single angle quote below.
		rt = Replace(rt, Chr(139), "(", "All");
		rt = Replace(rt, Chr(8249), "(", "All");
		rt = Replace(rt, Chr(140), "Oe", "All");
		rt = Replace(rt, Chr(338), "Oe", "All");
		rt = Replace(rt, Chr(145), "`", "All");
		rt = Replace(rt, Chr(8216), "`", "All");
		rt = Replace(rt, Chr(146), "'", "All");
		rt = Replace(rt, Chr(8217), "'", "All");
		rt = Replace(rt, Chr(147), """", "All");
		rt = Replace(rt, Chr(8220), """", "All");
		rt = Replace(rt, Chr(148), """", "All");
		rt = Replace(rt, Chr(8221), """", "All");
		rt = Replace(rt, Chr(149), "*", "All");
		rt = Replace(rt, Chr(8226), "*", "All");
		rt = Replace(rt, Chr(150), "-", "All");
		rt = Replace(rt, Chr(8211), "-", "All");
		rt = Replace(rt, Chr(151), "--", "All");
		rt = Replace(rt, Chr(8212), "--", "All");
		rt = Replace(rt, Chr(152), "~", "All");
		rt = Replace(rt, Chr(732), "~", "All");
		rt = Replace(rt, Chr(153), "&trade;", "All");
		rt = Replace(rt, Chr(8482), "&trade;", "All");
		rt = Replace(rt, Chr(155), ")", "All");
		rt = Replace(rt, Chr(8250), ")", "All");
		rt = Replace(rt, Chr(156), "oe", "All");
		rt = Replace(rt, Chr(339), "oe", "All");

		// Remove any remaining characters in the 128-159 range; this must run
		// after the explicit mappings above or it would erase them first.
		for (i = 128; i LTE 159; i = i + 1) {
			rt = Replace(rt, Chr(i), "", "All");
		}

		// Map selected Latin-1 supplement characters to their named entities.
		// These must be literal entity text; writing the character itself here
		// would make the replacement a no-op.
		rt = Replace(rt, Chr(160), "&nbsp;", "All");
		rt = Replace(rt, Chr(163), "&pound;", "All");
		rt = Replace(rt, Chr(169), "&copy;", "All");
		rt = Replace(rt, Chr(176), "&deg;", "All");

		// Encode whatever is left in 160-255 as a numeric entity (&#NNN; format).
		for (i = 160; i LTE 255; i = i + 1) {
			rt = REReplace(rt, "(#Chr(i)#)", "&###i#;", "All");
		}

		// Supply a missing semicolon at the end of three-digit numeric entities.
		rt = ReReplace(rt, "&##([0-2][[:digit:]]{2})([^;])", "&##\1;\2", "All");

		// Replace the zero-padded numeric forms of & < > with named entities.
		rt = ReReplace(rt, "&##038;", "&amp;", "All");
		rt = ReReplace(rt, "&##060;", "&lt;", "All");
		rt = ReReplace(rt, "&##062;", "&gt;", "All");

		// Supply a missing semicolon after the "amp" and "quot" entity names.
		// [^;] is a negated character class: any character except a semicolon.
		rt = ReReplace(rt, "&amp([^;])", "&amp;\1", "All");
		rt = ReReplace(rt, "&quot([^;])", "&quot;\1", "All");
	</cfscript>
	<cfreturn rt />
</cffunction>