Superlative Solutions' HelpDesk Language
 
HomeKnowledge BaseColdFusion and ProgrammingColdFusion DeMoronize() Function
Information
Article ID: 33
Created On: 12/3/2008
Modified: 12/3/2008
ColdFusion DeMoronize() Function
The code snippet included in this document removes or replaces the high-value ASCII and Unicode characters that can cause problems with Web pages. These characters usually appear when a person copies and pastes text from Microsoft Word or another word-processing application.

The code sample is intended to be a CFC function, but can be adapted as necessary:

<cffunction name="DeMoronize" access="public" output="no" returntype="string" hint="Fixes text using Microsoft Latin-1
 Extensions, namely ASCII characters 128-160 and UniCode characters">
 <cfargument name="inputString" type="string" required="yes" hint="String to DeMoronize">
 <!---
  Replaces "smart" punctuation and other high-value characters with plain
  ASCII text or HTML entities.

  inputString: the text to clean; it is trimmed before processing.
  Returns:     the cleaned string.

  Processing order matters:
   1. Known characters (code points 128-159 and their Unicode equivalents)
      are mapped to readable substitutes.
   2. Anything still left in 128-159 is removed.
   3. Code points 160-255 become named or numeric entities.
   4. Malformed entities already present in the text are repaired.
 --->
 <cfscript>
  var i = 0;
  var rt = trim(arguments.inputString);

  // Map incompatible non-ISO characters to plausible substitutes.
  // Each character is handled twice: once by its single-byte code
  // (128-159) and once by its Unicode code point.

  // euro sign
  rt = Replace(rt, Chr(128), "&euro;", "All");
  rt = Replace(rt, Chr(8364), "&euro;", "All");

  // single low-9 quotation mark
  rt = Replace(rt, Chr(130), ",", "All");
  rt = Replace(rt, Chr(8218), ",", "All");

  // small f with hook
  rt = Replace(rt, Chr(131), "<em>f</em>", "All");
  rt = Replace(rt, Chr(402), "<em>f</em>", "All");

  // double low-9 quotation mark
  rt = Replace(rt, Chr(132), ",,", "All");
  rt = Replace(rt, Chr(8222), ",,", "All");

  // horizontal ellipsis
  rt = Replace(rt, Chr(133), "...", "All");
  rt = Replace(rt, Chr(8230), "...", "All");

  // modifier circumflex accent
  rt = Replace(rt, Chr(136), "^", "All");
  rt = Replace(rt, Chr(710), "^", "All");

  // single LEFT-pointing angle quotation mark.
  // Previously mapped to ")" (same as the right-pointing mark), which
  // lost the direction; use the escaped less-than sign instead.
  rt = Replace(rt, Chr(139), "&lt;", "All");
  rt = Replace(rt, Chr(8249), "&lt;", "All");

  // capital ligature OE
  rt = Replace(rt, Chr(140), "Oe", "All");
  rt = Replace(rt, Chr(338), "Oe", "All");

  // left single quotation mark
  rt = Replace(rt, Chr(145), "`", "All");
  rt = Replace(rt, Chr(8216), "`", "All");

  // right single quotation mark
  rt = Replace(rt, Chr(146), "'", "All");
  rt = Replace(rt, Chr(8217), "'", "All");

  // left double quotation mark
  rt = Replace(rt, Chr(147), """", "All");
  rt = Replace(rt, Chr(8220), """", "All");

  // right double quotation mark
  rt = Replace(rt, Chr(148), """", "All");
  rt = Replace(rt, Chr(8221), """", "All");

  // bullet
  rt = Replace(rt, Chr(149), "*", "All");
  rt = Replace(rt, Chr(8226), "*", "All");

  // en dash
  rt = Replace(rt, Chr(150), "-", "All");
  rt = Replace(rt, Chr(8211), "-", "All");

  // em dash
  rt = Replace(rt, Chr(151), "--", "All");
  rt = Replace(rt, Chr(8212), "--", "All");

  // small tilde
  rt = Replace(rt, Chr(152), "~", "All");
  rt = Replace(rt, Chr(732), "~", "All");

  // trade mark sign
  rt = Replace(rt, Chr(153), "&trade;", "All");
  rt = Replace(rt, Chr(8482), "&trade;", "All");

  // single RIGHT-pointing angle quotation mark
  rt = Replace(rt, Chr(155), "&gt;", "All");
  rt = Replace(rt, Chr(8250), "&gt;", "All");

  // small ligature oe
  rt = Replace(rt, Chr(156), "oe", "All");
  rt = Replace(rt, Chr(339), "oe", "All");

  // Remove any characters in 128-159 that were not mapped above.
  for (i = 128; i LTE 159; i = i + 1) {
   rt = Replace(rt, Chr(i), "", "All");
  }

  // Map selected Latin-1 supplement characters to their named entities.
  // This must run before the numeric loop below, which would otherwise
  // turn them into &#NNN; form.
  rt = Replace(rt, Chr(160), "&nbsp;", "All");
  rt = Replace(rt, Chr(163), "&pound;", "All");
  rt = Replace(rt, Chr(169), "&copy;", "All");
  rt = Replace(rt, Chr(176), "&deg;", "All");

  // Encode whatever remains in 160-255 as a numeric entity (&#NNN;).
  // A literal Replace is sufficient; no regular expression is needed.
  for (i = 160; i LTE 255; i = i + 1) {
   rt = Replace(rt, Chr(i), "&##" & i & ";", "All");
  }

  // Supply a missing semicolon after a three-digit numeric entity.
  // The negative lookahead ensures that:
  //  - a longer entity such as &#1234; is left alone (next char is a digit),
  //  - an entity at the very end of the string is repaired,
  //  - adjacent entities are each repaired (nothing is consumed).
  rt = REReplace(rt, "&##([0-2][0-9]{2})(?![0-9;])", "&##\1;", "All");

  // Convert numeric renderings of & < > to their named entities.
  rt = REReplace(rt, "&##038;", "&amp;", "All");
  rt = REReplace(rt, "&##060;", "&lt;", "All");
  rt = REReplace(rt, "&##062;", "&gt;", "All");

  // Supply a missing semicolon after &amp and &quot.
  // The previous pattern "(^;)" was a start-of-string anchor inside a
  // group and could never match; a negative lookahead expresses the
  // intended "not followed by a semicolon".
  rt = REReplace(rt, "&amp(?!;)", "&amp;", "All");
  rt = REReplace(rt, "&quot(?!;)", "&quot;", "All");

 </cfscript>
 <cfreturn rt />
</cffunction>