MySQL Forums
Forum List  »  Stored Procedures

MySQL function to search and replace HTML Entities with international characters
Posted by: Charlie Armor
Date: February 06, 2009 03:33PM

HTML Entities are often contained within data written to MySQL by older PHP applications.

http://www.w3schools.com/tags/ref_entities.asp

Attached is a MySQL function I've written to allow a MySQL SELECT statement to identify HTML Entities within the data and replace them with the relevant symbol, reserved or accented character, the function replaces all the HTML Entity Names with the relevant LATIN1 characters.

mysql> SELECT HTML_UnEncode('Dr.Hübner');
+---------------------------------+
| HTML_UnEncode('Dr.Hübner') |
+---------------------------------+
| Dr.Hübner |
+---------------------------------+
1 row in set (0.00 sec)

I am using it to process data captured by a web-based PHP application, although it's straightforward to remove these values as part of a data transfer exercise I couldn't find an easier way to handle HTML entities within a SELECT statement.

I'm posting the function in case you are struggling with the same problem but also in the hope that you might be able to improve on it. If you think it can be improved on please post your suggestions below.

In practise I use REGEXP to test for the occurrence of an HTML Entity before calling the function when one is present:

SELECT
deliveryReference AS Reference,
# Test for accented characters
CASE WHEN O.deliveryName REGEXP '&[^\s]*;'
THEN HTML_UnEncode(O.deliveryName)
ELSE O.deliveryName END AS Contact,
.......


The function is below, it should be created in the source database from which the data is being drawn:

DELIMITER $$

DROP FUNCTION IF EXISTS `sourcedatabase`.`HTML_UnEncode`$$
CREATE DEFINER=`user`@`localhost` FUNCTION `HTML_UnEncode`(x VARCHAR(255)) RETURNS varchar(255) CHARSET latin1
BEGIN

DECLARE TextString VARCHAR(255) ;
SET TextString = x ;

#quotation mark
IF INSTR( x , '"' )
THEN SET TextString = REPLACE(TextString, '"','"') ;
END IF ;

#apostrophe 
IF INSTR( x , ''' )
THEN SET TextString = REPLACE(TextString, ''','"') ;
END IF ;

#ampersand
IF INSTR( x , '&' )
THEN SET TextString = REPLACE(TextString, '&','&') ;
END IF ;

#less-than
IF INSTR( x , '<' )
THEN SET TextString = REPLACE(TextString, '&lt;','<') ;
END IF ;

#greater-than
IF INSTR( x , '&gt;' )
THEN SET TextString = REPLACE(TextString, '&gt;','>') ;
END IF ;

#non-breaking space
IF INSTR( x , '&nbsp;' )
THEN SET TextString = REPLACE(TextString, '&nbsp;',' ') ;
END IF ;

#inverted exclamation mark
IF INSTR( x , '&iexcl;' )
THEN SET TextString = REPLACE(TextString, '&iexcl;','¡') ;
END IF ;

#cent
IF INSTR( x , '&cent;' )
THEN SET TextString = REPLACE(TextString, '&cent;','¢') ;
END IF ;

#pound
IF INSTR( x , '&pound;' )
THEN SET TextString = REPLACE(TextString, '&pound;','£') ;
END IF ;

#currency
IF INSTR( x , '&curren;' )
THEN SET TextString = REPLACE(TextString, '&curren;','¤') ;
END IF ;

#yen
IF INSTR( x , '&yen;' )
THEN SET TextString = REPLACE(TextString, '&yen;','¥') ;
END IF ;

#broken vertical bar
IF INSTR( x , '&brvbar;' )
THEN SET TextString = REPLACE(TextString, '&brvbar;','¦') ;
END IF ;

#section
IF INSTR( x , '&sect;' )
THEN SET TextString = REPLACE(TextString, '&sect;','§') ;
END IF ;

#spacing diaeresis
IF INSTR( x , '&uml;' )
THEN SET TextString = REPLACE(TextString, '&uml;','¨') ;
END IF ;

#copyright
IF INSTR( x , '&copy;' )
THEN SET TextString = REPLACE(TextString, '&copy;','©') ;
END IF ;

#feminine ordinal indicator
IF INSTR( x , '&ordf;' )
THEN SET TextString = REPLACE(TextString, '&ordf;','ª') ;
END IF ;

#angle quotation mark (left)
IF INSTR( x , '&laquo;' )
THEN SET TextString = REPLACE(TextString, '&laquo;','«') ;
END IF ;

#negation
IF INSTR( x , '&not;' )
THEN SET TextString = REPLACE(TextString, '&not;','¬') ;
END IF ;

#soft hyphen
IF INSTR( x , '&shy;' )
THEN SET TextString = REPLACE(TextString, '&shy;','­') ;
END IF ;

#registered trademark
IF INSTR( x , '&reg;' )
THEN SET TextString = REPLACE(TextString, '&reg;','®') ;
END IF ;

#spacing macron
IF INSTR( x , '&macr;' )
THEN SET TextString = REPLACE(TextString, '&macr;','¯') ;
END IF ;

#degree
IF INSTR( x , '&deg;' )
THEN SET TextString = REPLACE(TextString, '&deg;','°') ;
END IF ;

#plus-or-minus 
IF INSTR( x , '&plusmn;' )
THEN SET TextString = REPLACE(TextString, '&plusmn;','±') ;
END IF ;

#superscript 2
IF INSTR( x , '&sup2;' )
THEN SET TextString = REPLACE(TextString, '&sup2;','²') ;
END IF ;

#superscript 3
IF INSTR( x , '&sup3;' )
THEN SET TextString = REPLACE(TextString, '&sup3;','³') ;
END IF ;

#spacing acute
IF INSTR( x , '&acute;' )
THEN SET TextString = REPLACE(TextString, '&acute;','´') ;
END IF ;

#micro
IF INSTR( x , '&micro;' )
THEN SET TextString = REPLACE(TextString, '&micro;','µ') ;
END IF ;

#paragraph
IF INSTR( x , '&para;' )
THEN SET TextString = REPLACE(TextString, '&para;','¶') ;
END IF ;

#middle dot
IF INSTR( x , '&middot;' )
THEN SET TextString = REPLACE(TextString, '&middot;','·') ;
END IF ;

#spacing cedilla
IF INSTR( x , '&cedil;' )
THEN SET TextString = REPLACE(TextString, '&cedil;','¸') ;
END IF ;

#superscript 1
IF INSTR( x , '&sup1;' )
THEN SET TextString = REPLACE(TextString, '&sup1;','¹') ;
END IF ;

#masculine ordinal indicator
IF INSTR( x , '&ordm;' )
THEN SET TextString = REPLACE(TextString, '&ordm;','º') ;
END IF ;

#angle quotation mark (right)
IF INSTR( x , '&raquo;' )
THEN SET TextString = REPLACE(TextString, '&raquo;','»') ;
END IF ;

#fraction 1/4
IF INSTR( x , '&frac14;' )
THEN SET TextString = REPLACE(TextString, '&frac14;','¼') ;
END IF ;

#fraction 1/2
IF INSTR( x , '&frac12;' )
THEN SET TextString = REPLACE(TextString, '&frac12;','½') ;
END IF ;

#fraction 3/4
IF INSTR( x , '&frac34;' )
THEN SET TextString = REPLACE(TextString, '&frac34;','¾') ;
END IF ;

#inverted question mark
IF INSTR( x , '&iquest;' )
THEN SET TextString = REPLACE(TextString, '&iquest;','¿') ;
END IF ;

#multiplication
IF INSTR( x , '&times;' )
THEN SET TextString = REPLACE(TextString, '&times;','×') ;
END IF ;

#division
IF INSTR( x , '&divide;' )
THEN SET TextString = REPLACE(TextString, '&divide;','÷') ;
END IF ;

#capital a, grave accent
IF INSTR( x , '&Agrave;' )
THEN SET TextString = REPLACE(TextString, '&Agrave;','À') ;
END IF ;

#capital a, acute accent
IF INSTR( x , '&Aacute;' )
THEN SET TextString = REPLACE(TextString, '&Aacute;','Á') ;
END IF ;

#capital a, circumflex accent
IF INSTR( x , '&Acirc;' )
THEN SET TextString = REPLACE(TextString, '&Acirc;','Â') ;
END IF ;

#capital a, tilde
IF INSTR( x , '&Atilde;' )
THEN SET TextString = REPLACE(TextString, '&Atilde;','Ã') ;
END IF ;

#capital a, umlaut mark
IF INSTR( x , '&Auml;' )
THEN SET TextString = REPLACE(TextString, '&Auml;','Ä') ;
END IF ;

#capital a, ring
IF INSTR( x , '&Aring;' )
THEN SET TextString = REPLACE(TextString, '&Aring;','Å') ;
END IF ;

#capital ae
IF INSTR( x , '&AElig;' )
THEN SET TextString = REPLACE(TextString, '&AElig;','Æ') ;
END IF ;

#capital c, cedilla
IF INSTR( x , '&Ccedil;' )
THEN SET TextString = REPLACE(TextString, '&Ccedil;','Ç') ;
END IF ;

#capital e, grave accent
IF INSTR( x , '&Egrave;' )
THEN SET TextString = REPLACE(TextString, '&Egrave;','È') ;
END IF ;

#capital e, acute accent
IF INSTR( x , '&Eacute;' )
THEN SET TextString = REPLACE(TextString, '&Eacute;','É') ;
END IF ;

#capital e, circumflex accent
IF INSTR( x , '&Ecirc;' )
THEN SET TextString = REPLACE(TextString, '&Ecirc;','Ê') ;
END IF ;

#capital e, umlaut mark
IF INSTR( x , '&Euml;' )
THEN SET TextString = REPLACE(TextString, '&Euml;','Ë') ;
END IF ;

#capital i, grave accent
IF INSTR( x , '&Igrave;' )
THEN SET TextString = REPLACE(TextString, '&Igrave;','Ì') ;
END IF ;

#capital i, acute accent
IF INSTR( x , '&Iacute;' )
THEN SET TextString = REPLACE(TextString, '&Iacute;','Í') ;
END IF ;

#capital i, circumflex accent
IF INSTR( x , '&Icirc;' )
THEN SET TextString = REPLACE(TextString, '&Icirc;','Î') ;
END IF ;

#capital i, umlaut mark
IF INSTR( x , '&Iuml;' )
THEN SET TextString = REPLACE(TextString, '&Iuml;','Ï') ;
END IF ;

#capital eth, Icelandic
IF INSTR( x , '&ETH;' )
THEN SET TextString = REPLACE(TextString, '&ETH;','Ð') ;
END IF ;

#capital n, tilde
IF INSTR( x , '&Ntilde;' )
THEN SET TextString = REPLACE(TextString, '&Ntilde;','Ñ') ;
END IF ;

#capital o, grave accent
IF INSTR( x , '&Ograve;' )
THEN SET TextString = REPLACE(TextString, '&Ograve;','Ò') ;
END IF ;

#capital o, acute accent
IF INSTR( x , '&Oacute;' )
THEN SET TextString = REPLACE(TextString, '&Oacute;','Ó') ;
END IF ;

#capital o, circumflex accent
IF INSTR( x , '&Ocirc;' )
THEN SET TextString = REPLACE(TextString, '&Ocirc;','Ô') ;
END IF ;

#capital o, tilde
IF INSTR( x , '&Otilde;' )
THEN SET TextString = REPLACE(TextString, '&Otilde;','Õ') ;
END IF ;

#capital o, umlaut mark
IF INSTR( x , '&Ouml;' )
THEN SET TextString = REPLACE(TextString, '&Ouml;','Ö') ;
END IF ;

#capital o, slash
IF INSTR( x , '&Oslash;' )
THEN SET TextString = REPLACE(TextString, '&Oslash;','Ø') ;
END IF ;

#capital u, grave accent
IF INSTR( x , '&Ugrave;' )
THEN SET TextString = REPLACE(TextString, '&Ugrave;','Ù') ;
END IF ;

#capital u, acute accent
IF INSTR( x , '&Uacute;' )
THEN SET TextString = REPLACE(TextString, '&Uacute;','Ú') ;
END IF ;

#capital u, circumflex accent
IF INSTR( x , '&Ucirc;' )
THEN SET TextString = REPLACE(TextString, '&Ucirc;','Û') ;
END IF ;

#capital u, umlaut mark
IF INSTR( x , '&Uuml;' )
THEN SET TextString = REPLACE(TextString, '&Uuml;','Ü') ;
END IF ;

#capital y, acute accent
IF INSTR( x , '&Yacute;' )
THEN SET TextString = REPLACE(TextString, '&Yacute;','Ý') ;
END IF ;

#capital THORN, Icelandic
IF INSTR( x , '&THORN;' )
THEN SET TextString = REPLACE(TextString, '&THORN;','Þ') ;
END IF ;

#small sharp s, German
IF INSTR( x , '&szlig;' )
THEN SET TextString = REPLACE(TextString, '&szlig;','ß') ;
END IF ;

#small a, grave accent
IF INSTR( x , '&agrave;' )
THEN SET TextString = REPLACE(TextString, '&agrave;','à') ;
END IF ;

#small a, acute accent
IF INSTR( x , '&aacute;' )
THEN SET TextString = REPLACE(TextString, '&aacute;','á') ;
END IF ;

#small a, circumflex accent
IF INSTR( x , '&acirc;' )
THEN SET TextString = REPLACE(TextString, '&acirc;','â') ;
END IF ;

#small a, tilde
IF INSTR( x , '&atilde;' )
THEN SET TextString = REPLACE(TextString, '&atilde;','ã') ;
END IF ;

#small a, umlaut mark
IF INSTR( x , '&auml;' )
THEN SET TextString = REPLACE(TextString, '&auml;','ä') ;
END IF ;

#small a, ring
IF INSTR( x , '&aring;' )
THEN SET TextString = REPLACE(TextString, '&aring;','å') ;
END IF ;

#small ae
IF INSTR( x , '&aelig;' )
THEN SET TextString = REPLACE(TextString, '&aelig;','æ') ;
END IF ;

#small c, cedilla
IF INSTR( x , '&ccedil;' )
THEN SET TextString = REPLACE(TextString, '&ccedil;','ç') ;
END IF ;

#small e, grave accent
IF INSTR( x , '&egrave;' )
THEN SET TextString = REPLACE(TextString, '&egrave;','è') ;
END IF ;

#small e, acute accent
IF INSTR( x , '&eacute;' )
THEN SET TextString = REPLACE(TextString, '&eacute;','é') ;
END IF ;

#small e, circumflex accent
IF INSTR( x , '&ecirc;' )
THEN SET TextString = REPLACE(TextString, '&ecirc;','ê') ;
END IF ;

#small e, umlaut mark
IF INSTR( x , '&euml;' )
THEN SET TextString = REPLACE(TextString, '&euml;','ë') ;
END IF ;

#small i, grave accent
IF INSTR( x , '&igrave;' )
THEN SET TextString = REPLACE(TextString, '&igrave;','ì') ;
END IF ;

#small i, acute accent
IF INSTR( x , '&iacute;' )
THEN SET TextString = REPLACE(TextString, '&iacute;','í') ;
END IF ;

#small i, circumflex accent
IF INSTR( x , '&icirc;' )
THEN SET TextString = REPLACE(TextString, '&icirc;','î') ;
END IF ;

#small i, umlaut mark
IF INSTR( x , '&iuml;' )
THEN SET TextString = REPLACE(TextString, '&iuml;','ï') ;
END IF ;

#small eth, Icelandic
IF INSTR( x , '&eth;' )
THEN SET TextString = REPLACE(TextString, '&eth;','ð') ;
END IF ;

#small n, tilde
IF INSTR( x , '&ntilde;' )
THEN SET TextString = REPLACE(TextString, '&ntilde;','ñ') ;
END IF ;

#small o, grave accent
IF INSTR( x , '&ograve;' )
THEN SET TextString = REPLACE(TextString, '&ograve;','ò') ;
END IF ;

#small o, acute accent
IF INSTR( x , '&oacute;' )
THEN SET TextString = REPLACE(TextString, '&oacute;','ó') ;
END IF ;

#small o, circumflex accent
IF INSTR( x , '&ocirc;' )
THEN SET TextString = REPLACE(TextString, '&ocirc;','ô') ;
END IF ;

#small o, tilde
IF INSTR( x , '&otilde;' )
THEN SET TextString = REPLACE(TextString, '&otilde;','õ') ;
END IF ;

#small o, umlaut mark
IF INSTR( x , '&ouml;' )
THEN SET TextString = REPLACE(TextString, '&ouml;','ö') ;
END IF ;

#small o, slash
IF INSTR( x , '&oslash;' )
THEN SET TextString = REPLACE(TextString, '&oslash;','ø') ;
END IF ;

#small u, grave accent
IF INSTR( x , '&ugrave;' )
THEN SET TextString = REPLACE(TextString, '&ugrave;','ù') ;
END IF ;

#small u, acute accent
IF INSTR( x , '&uacute;' )
THEN SET TextString = REPLACE(TextString, '&uacute;','ú') ;
END IF ;

#small u, circumflex accent
IF INSTR( x , '&ucirc;' )
THEN SET TextString = REPLACE(TextString, '&ucirc;','û') ;
END IF ;

#small u, umlaut mark
IF INSTR( x , '&uuml;' )
THEN SET TextString = REPLACE(TextString, '&uuml;','ü') ;
END IF ;

#small y, acute accent
IF INSTR( x , '&yacute;' )
THEN SET TextString = REPLACE(TextString, '&yacute;','ý') ;
END IF ;

#small thorn, Icelandic
IF INSTR( x , '&thorn;' )
THEN SET TextString = REPLACE(TextString, '&thorn;','þ') ;
END IF ;

#small y, umlaut mark
IF INSTR( x , '&yuml;' )
THEN SET TextString = REPLACE(TextString, '&yuml;','ÿ') ;
END IF ;

RETURN TextString ;

END$$

DELIMITER ;



Edited 6 time(s). Last edit at 02/06/2009 03:50PM by Charlie Armor.

Options: ReplyQuote


Subject
Views
Written By
Posted
MySQL function to search and replace HTML Entities with international characters
84481
February 06, 2009 03:33PM


Sorry, you can't reply to this topic. It has been closed.

Content reproduced on this site is the property of the respective copyright holders. It is not reviewed in advance by Oracle and does not necessarily represent the opinion of Oracle or any other party.