is there a mysql function to decode html entities?
i was wondering if there is a MYSQL function to decode text with html entities. I have seen some approaches using REPLACE but it looks kinda hard to manage all the entities.
Solution 1:
You can create function like below
DELIMITER $$
DROP FUNCTION IF EXISTS `HTML_UnEncode`$$
CREATE FUNCTION `HTML_UnEncode`(X VARCHAR(255)) RETURNS VARCHAR(255) CHARSET latin1 DETERMINISTIC
BEGIN
DECLARE TextString VARCHAR(255) ;
SET TextString = X ;
#quotation mark
IF INSTR( X , '"' )
THEN SET TextString = REPLACE(TextString, '"','"') ;
END IF ;
#apostrophe
IF INSTR( X , ''' )
THEN SET TextString = REPLACE(TextString, ''','"') ;
END IF ;
#ampersand
IF INSTR( X , '&' )
THEN SET TextString = REPLACE(TextString, '&','&') ;
END IF ;
#less-than
IF INSTR( X , '<' )
THEN SET TextString = REPLACE(TextString, '<','<') ;
END IF ;
#greater-than
IF INSTR( X , '>' )
THEN SET TextString = REPLACE(TextString, '>','>') ;
END IF ;
#non-breaking space
IF INSTR( X , ' ' )
THEN SET TextString = REPLACE(TextString, ' ',' ') ;
END IF ;
#inverted exclamation mark
IF INSTR( X , '¡' )
THEN SET TextString = REPLACE(TextString, '¡','¡') ;
END IF ;
#cent
IF INSTR( X , '¢' )
THEN SET TextString = REPLACE(TextString, '¢','¢') ;
END IF ;
#pound
IF INSTR( X , '£' )
THEN SET TextString = REPLACE(TextString, '£','£') ;
END IF ;
#currency
IF INSTR( X , '¤' )
THEN SET TextString = REPLACE(TextString, '¤','¤') ;
END IF ;
#yen
IF INSTR( X , '¥' )
THEN SET TextString = REPLACE(TextString, '¥','¥') ;
END IF ;
#broken vertical bar
IF INSTR( X , '¦' )
THEN SET TextString = REPLACE(TextString, '¦','¦') ;
END IF ;
#section
IF INSTR( X , '§' )
THEN SET TextString = REPLACE(TextString, '§','§') ;
END IF ;
#spacing diaeresis
IF INSTR( X , '¨' )
THEN SET TextString = REPLACE(TextString, '¨','¨') ;
END IF ;
#copyright
IF INSTR( X , '©' )
THEN SET TextString = REPLACE(TextString, '©','©') ;
END IF ;
#feminine ordinal indicator
IF INSTR( X , 'ª' )
THEN SET TextString = REPLACE(TextString, 'ª','ª') ;
END IF ;
#angle quotation mark (left)
IF INSTR( X , '«' )
THEN SET TextString = REPLACE(TextString, '«','«') ;
END IF ;
#negation
IF INSTR( X , '¬' )
THEN SET TextString = REPLACE(TextString, '¬','¬') ;
END IF ;
#soft hyphen
IF INSTR( X , '­' )
THEN SET TextString = REPLACE(TextString, '­','') ;
END IF ;
#registered trademark
IF INSTR( X , '®' )
THEN SET TextString = REPLACE(TextString, '®','®') ;
END IF ;
#spacing macron
IF INSTR( X , '¯' )
THEN SET TextString = REPLACE(TextString, '¯','¯') ;
END IF ;
#degree
IF INSTR( X , '°' )
THEN SET TextString = REPLACE(TextString, '°','°') ;
END IF ;
#plus-or-minus
IF INSTR( X , '±' )
THEN SET TextString = REPLACE(TextString, '±','±') ;
END IF ;
#superscript 2
IF INSTR( X , '²' )
THEN SET TextString = REPLACE(TextString, '²','²') ;
END IF ;
#superscript 3
IF INSTR( X , '³' )
THEN SET TextString = REPLACE(TextString, '³','³') ;
END IF ;
#spacing acute
IF INSTR( X , '´' )
THEN SET TextString = REPLACE(TextString, '´','´') ;
END IF ;
#micro
IF INSTR( X , 'µ' )
THEN SET TextString = REPLACE(TextString, 'µ','µ') ;
END IF ;
#paragraph
IF INSTR( X , '¶' )
THEN SET TextString = REPLACE(TextString, '¶','¶') ;
END IF ;
#middle dot
IF INSTR( X , '·' )
THEN SET TextString = REPLACE(TextString, '·','·') ;
END IF ;
#spacing cedilla
IF INSTR( X , '¸' )
THEN SET TextString = REPLACE(TextString, '¸','¸') ;
END IF ;
#superscript 1
IF INSTR( X , '¹' )
THEN SET TextString = REPLACE(TextString, '¹','¹') ;
END IF ;
#masculine ordinal indicator
IF INSTR( X , 'º' )
THEN SET TextString = REPLACE(TextString, 'º','º') ;
END IF ;
#angle quotation mark (right)
IF INSTR( X , '»' )
THEN SET TextString = REPLACE(TextString, '»','»') ;
END IF ;
#fraction 1/4
IF INSTR( X , '¼' )
THEN SET TextString = REPLACE(TextString, '¼','¼') ;
END IF ;
#fraction 1/2
IF INSTR( X , '½' )
THEN SET TextString = REPLACE(TextString, '½','½') ;
END IF ;
#fraction 3/4
IF INSTR( X , '¾' )
THEN SET TextString = REPLACE(TextString, '¾','¾') ;
END IF ;
#inverted question mark
IF INSTR( X , '¿' )
THEN SET TextString = REPLACE(TextString, '¿','¿') ;
END IF ;
#multiplication
IF INSTR( X , '×' )
THEN SET TextString = REPLACE(TextString, '×','×') ;
END IF ;
#division
IF INSTR( X , '÷' )
THEN SET TextString = REPLACE(TextString, '÷','÷') ;
END IF ;
#capital a, grave accent
IF INSTR( X , 'À' )
THEN SET TextString = REPLACE(TextString, 'À','À') ;
END IF ;
#capital a, acute accent
IF INSTR( X , 'Á' )
THEN SET TextString = REPLACE(TextString, 'Á','Á') ;
END IF ;
#capital a, circumflex accent
IF INSTR( X , 'Â' )
THEN SET TextString = REPLACE(TextString, 'Â','Â') ;
END IF ;
#capital a, tilde
IF INSTR( X , 'Ã' )
THEN SET TextString = REPLACE(TextString, 'Ã','Ã') ;
END IF ;
#capital a, umlaut mark
IF INSTR( X , 'Ä' )
THEN SET TextString = REPLACE(TextString, 'Ä','Ä') ;
END IF ;
#capital a, ring
IF INSTR( X , 'Å' )
THEN SET TextString = REPLACE(TextString, 'Å','Å') ;
END IF ;
#capital ae
IF INSTR( X , 'Æ' )
THEN SET TextString = REPLACE(TextString, 'Æ','Æ') ;
END IF ;
#capital c, cedilla
IF INSTR( X , 'Ç' )
THEN SET TextString = REPLACE(TextString, 'Ç','Ç') ;
END IF ;
#capital e, grave accent
IF INSTR( X , 'È' )
THEN SET TextString = REPLACE(TextString, 'È','È') ;
END IF ;
#capital e, acute accent
IF INSTR( X , 'É' )
THEN SET TextString = REPLACE(TextString, 'É','É') ;
END IF ;
#capital e, circumflex accent
IF INSTR( X , 'Ê' )
THEN SET TextString = REPLACE(TextString, 'Ê','Ê') ;
END IF ;
#capital e, umlaut mark
IF INSTR( X , 'Ë' )
THEN SET TextString = REPLACE(TextString, 'Ë','Ë') ;
END IF ;
#capital i, grave accent
IF INSTR( X , 'Ì' )
THEN SET TextString = REPLACE(TextString, 'Ì','Ì') ;
END IF ;
#capital i, acute accent
IF INSTR( X , 'Í' )
THEN SET TextString = REPLACE(TextString, 'Í','Í') ;
END IF ;
#capital i, circumflex accent
IF INSTR( X , 'Î' )
THEN SET TextString = REPLACE(TextString, 'Î','Î') ;
END IF ;
#capital i, umlaut mark
IF INSTR( X , 'Ï' )
THEN SET TextString = REPLACE(TextString, 'Ï','Ï') ;
END IF ;
#capital eth, Icelandic
IF INSTR( X , 'Ð' )
THEN SET TextString = REPLACE(TextString, 'Ð','Ð') ;
END IF ;
#capital n, tilde
IF INSTR( X , 'Ñ' )
THEN SET TextString = REPLACE(TextString, 'Ñ','Ñ') ;
END IF ;
#capital o, grave accent
IF INSTR( X , 'Ò' )
THEN SET TextString = REPLACE(TextString, 'Ò','Ò') ;
END IF ;
#capital o, acute accent
IF INSTR( X , 'Ó' )
THEN SET TextString = REPLACE(TextString, 'Ó','Ó') ;
END IF ;
#capital o, circumflex accent
IF INSTR( X , 'Ô' )
THEN SET TextString = REPLACE(TextString, 'Ô','Ô') ;
END IF ;
#capital o, tilde
IF INSTR( X , 'Õ' )
THEN SET TextString = REPLACE(TextString, 'Õ','Õ') ;
END IF ;
#capital o, umlaut mark
IF INSTR( X , 'Ö' )
THEN SET TextString = REPLACE(TextString, 'Ö','Ö') ;
END IF ;
#capital o, slash
IF INSTR( X , 'Ø' )
THEN SET TextString = REPLACE(TextString, 'Ø','Ø') ;
END IF ;
#capital u, grave accent
IF INSTR( X , 'Ù' )
THEN SET TextString = REPLACE(TextString, 'Ù','Ù') ;
END IF ;
#capital u, acute accent
IF INSTR( X , 'Ú' )
THEN SET TextString = REPLACE(TextString, 'Ú','Ú') ;
END IF ;
#capital u, circumflex accent
IF INSTR( X , 'Û' )
THEN SET TextString = REPLACE(TextString, 'Û','Û') ;
END IF ;
#capital u, umlaut mark
IF INSTR( X , 'Ü' )
THEN SET TextString = REPLACE(TextString, 'Ü','Ü') ;
END IF ;
#capital y, acute accent
IF INSTR( X , 'Ý' )
THEN SET TextString = REPLACE(TextString, 'Ý','Ý') ;
END IF ;
#capital THORN, Icelandic
IF INSTR( X , 'Þ' )
THEN SET TextString = REPLACE(TextString, 'Þ','Þ') ;
END IF ;
#small sharp s, German
IF INSTR( X , 'ß' )
THEN SET TextString = REPLACE(TextString, 'ß','ß') ;
END IF ;
#small a, grave accent
IF INSTR( X , 'à' )
THEN SET TextString = REPLACE(TextString, 'à','à') ;
END IF ;
#small a, acute accent
IF INSTR( X , 'á' )
THEN SET TextString = REPLACE(TextString, 'á','á') ;
END IF ;
#small a, circumflex accent
IF INSTR( X , 'â' )
THEN SET TextString = REPLACE(TextString, 'â','â') ;
END IF ;
#small a, tilde
IF INSTR( X , 'ã' )
THEN SET TextString = REPLACE(TextString, 'ã','ã') ;
END IF ;
#small a, umlaut mark
IF INSTR( X , 'ä' )
THEN SET TextString = REPLACE(TextString, 'ä','ä') ;
END IF ;
#small a, ring
IF INSTR( X , 'å' )
THEN SET TextString = REPLACE(TextString, 'å','å') ;
END IF ;
#small ae
IF INSTR( X , 'æ' )
THEN SET TextString = REPLACE(TextString, 'æ','æ') ;
END IF ;
#small c, cedilla
IF INSTR( X , 'ç' )
THEN SET TextString = REPLACE(TextString, 'ç','ç') ;
END IF ;
#small e, grave accent
IF INSTR( X , 'è' )
THEN SET TextString = REPLACE(TextString, 'è','è') ;
END IF ;
#small e, acute accent
IF INSTR( X , 'é' )
THEN SET TextString = REPLACE(TextString, 'é','é') ;
END IF ;
#small e, circumflex accent
IF INSTR( X , 'ê' )
THEN SET TextString = REPLACE(TextString, 'ê','ê') ;
END IF ;
#small e, umlaut mark
IF INSTR( X , 'ë' )
THEN SET TextString = REPLACE(TextString, 'ë','ë') ;
END IF ;
#small i, grave accent
IF INSTR( X , 'ì' )
THEN SET TextString = REPLACE(TextString, 'ì','ì') ;
END IF ;
#small i, acute accent
IF INSTR( X , 'í' )
THEN SET TextString = REPLACE(TextString, 'í','í') ;
END IF ;
#small i, circumflex accent
IF INSTR( X , 'î' )
THEN SET TextString = REPLACE(TextString, 'î','î') ;
END IF ;
#small i, umlaut mark
IF INSTR( X , 'ï' )
THEN SET TextString = REPLACE(TextString, 'ï','ï') ;
END IF ;
#small eth, Icelandic
IF INSTR( X , 'ð' )
THEN SET TextString = REPLACE(TextString, 'ð','ð') ;
END IF ;
#small n, tilde
IF INSTR( X , 'ñ' )
THEN SET TextString = REPLACE(TextString, 'ñ','ñ') ;
END IF ;
#small o, grave accent
IF INSTR( X , 'ò' )
THEN SET TextString = REPLACE(TextString, 'ò','ò') ;
END IF ;
#small o, acute accent
IF INSTR( X , 'ó' )
THEN SET TextString = REPLACE(TextString, 'ó','ó') ;
END IF ;
#small o, circumflex accent
IF INSTR( X , 'ô' )
THEN SET TextString = REPLACE(TextString, 'ô','ô') ;
END IF ;
#small o, tilde
IF INSTR( X , 'õ' )
THEN SET TextString = REPLACE(TextString, 'õ','õ') ;
END IF ;
#small o, umlaut mark
IF INSTR( X , 'ö' )
THEN SET TextString = REPLACE(TextString, 'ö','ö') ;
END IF ;
#small o, slash
IF INSTR( X , 'ø' )
THEN SET TextString = REPLACE(TextString, 'ø','ø') ;
END IF ;
#small u, grave accent
IF INSTR( X , 'ù' )
THEN SET TextString = REPLACE(TextString, 'ù','ù') ;
END IF ;
#small u, acute accent
IF INSTR( X , 'ú' )
THEN SET TextString = REPLACE(TextString, 'ú','ú') ;
END IF ;
#small u, circumflex accent
IF INSTR( X , 'û' )
THEN SET TextString = REPLACE(TextString, 'û','û') ;
END IF ;
#small u, umlaut mark
IF INSTR( X , 'ü' )
THEN SET TextString = REPLACE(TextString, 'ü','ü') ;
END IF ;
#small y, acute accent
IF INSTR( X , 'ý' )
THEN SET TextString = REPLACE(TextString, 'ý','ý') ;
END IF ;
#small thorn, Icelandic
IF INSTR( X , 'þ' )
THEN SET TextString = REPLACE(TextString, 'þ','þ') ;
END IF ;
#small y, umlaut mark
IF INSTR( X , 'ÿ' )
THEN SET TextString = REPLACE(TextString, 'ÿ','ÿ') ;
END IF ;
RETURN TextString ;
END$$
DELIMITER ;
And execute below query :
SELECT HTML_UnEncode('this is a <a>test, nothing more');
Solution 2:
The following function can be used to decode numeric entities like 〹
CREATE FUNCTION entity_decode(txt TEXT CHARSET utf8) RETURNS TEXT CHARSET utf8
NO SQL
DETERMINISTIC
BEGIN
DECLARE tmp TEXT CHARSET utf8 DEFAULT txt;
DECLARE entity TEXT CHARSET utf8;
DECLARE pos1 INT DEFAULT 1;
DECLARE pos2 INT;
DECLARE codepoint INT;
IF txt IS NULL THEN
RETURN NULL;
END IF;
LOOP
SET pos1 = LOCATE('&#', tmp, pos1);
IF pos1 = 0 THEN
RETURN tmp;
END IF;
SET pos2 = LOCATE(';', tmp, pos1 + 2);
IF pos2 > pos1 THEN
SET entity = SUBSTRING(tmp, pos1, pos2 - pos1 + 1);
IF entity REGEXP '^&#[[:digit:]]+;$' THEN
SET codepoint = CAST(SUBSTRING(entity, 3, pos2 - pos1 - 2) AS UNSIGNED);
IF codepoint > 31 THEN
SET tmp = CONCAT(LEFT(tmp, pos1 - 1), CHAR(codepoint USING utf32), SUBSTRING(tmp, pos2 + 1));
END IF;
END IF;
END IF;
SET pos1 = pos1 + 1;
END LOOP;
END