Link to home
Start Free TrialLog in
Avatar of SweatCoder
SweatCoderFlag for United States of America

asked on

UTF-8 encoding

There's a Java class that does UTF-8 encoding, but I need it to work in JavaScript. Can anyone convert the following so that it will work in a JavaScript function? I tried for about an hour, and I kept getting errors because I don't think JS supports the hex[] lookup or append() in the way they're being used, and eventually I gave up. I changed all the appends to +=, but I kept getting stuck on the hex stuff.

I got the code below from http://www.w3.org/International/O-URL-code.html

/**
 * Provides a method to encode any string into a URL-safe
 * form.
 * Non-ASCII characters are first encoded as sequences of
 * two to four bytes, using the UTF-8 algorithm, before being
 * encoded as %HH escapes.
 */
public class URLUTF8Encoder
{

  /**
   * Lookup table: {@code hex[b]} is the lower-case escape {@code "%xy"}
   * for the byte value {@code b} (0..255).
   */
  final static String[] hex = buildHexTable();

  /** Punctuation that is passed through unescaped. */
  private static final String UNRESERVED_MARKS = "-_.!~*'()";

  /**
   * Encode a string to the "x-www-form-urlencoded" form, enhanced
   * with the UTF-8-in-URL proposal. This is what happens:
   *
   * <ul>
   * <li><p>The ASCII characters 'a' through 'z', 'A' through 'Z',
   *        and '0' through '9' remain the same.
   *
   * <li><p>The unreserved characters - _ . ! ~ * ' ( ) remain the same.
   *
   * <li><p>The space character ' ' is converted into a plus sign '+'.
   *
   * <li><p>All other ASCII characters are converted into the
   *        3-character string "%xy", where xy is
   *        the two-digit hexadecimal representation of the character
   *        code
   *
   * <li><p>All non-ASCII characters are encoded in two steps: first
   *        to a sequence of 2, 3 or 4 bytes, using the UTF-8 algorithm;
   *        secondly each of these bytes is encoded as "%xx".
   *        A surrogate pair is combined into its single code point
   *        first, so characters above U+FFFF yield one 4-byte sequence.
   * </ul>
   *
   * <p>An unpaired surrogate cannot be represented in UTF-8; it is
   * emitted through the 3-byte branch, as earlier versions of this
   * method did.
   *
   * @param s The string to be encoded
   * @return The encoded string
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static String encode(String s)
  {
    int len = s.length();
    StringBuilder sbuf = new StringBuilder(len + 16);
    int i = 0;
    while (i < len) {
      // Read a whole code point so that a surrogate pair is handled as one character.
      int cp = s.codePointAt(i);
      i += Character.charCount(cp);

      if (isUnreserved(cp)) {
        sbuf.append((char) cp);
      } else if (cp == ' ') {                  // space
        sbuf.append('+');
      } else if (cp <= 0x007F) {               // other ASCII
        sbuf.append(hex[cp]);
      } else if (cp <= 0x07FF) {               // 2-byte sequence
        sbuf.append(hex[0xC0 | (cp >> 6)]);
        sbuf.append(hex[0x80 | (cp & 0x3F)]);
      } else if (cp <= 0xFFFF) {               // 3-byte sequence
        sbuf.append(hex[0xE0 | (cp >> 12)]);
        sbuf.append(hex[0x80 | ((cp >> 6) & 0x3F)]);
        sbuf.append(hex[0x80 | (cp & 0x3F)]);
      } else {                                 // 4-byte sequence, U+10000..U+10FFFF
        sbuf.append(hex[0xF0 | (cp >> 18)]);
        sbuf.append(hex[0x80 | ((cp >> 12) & 0x3F)]);
        sbuf.append(hex[0x80 | ((cp >> 6) & 0x3F)]);
        sbuf.append(hex[0x80 | (cp & 0x3F)]);
      }
    }
    return sbuf.toString();
  }

  /** Returns true for ASCII letters, ASCII digits and the marks - _ . ! ~ * ' ( ). */
  private static boolean isUnreserved(int cp)
  {
    return ('A' <= cp && cp <= 'Z')
        || ('a' <= cp && cp <= 'z')
        || ('0' <= cp && cp <= '9')
        || UNRESERVED_MARKS.indexOf(cp) >= 0;
  }

  /** Builds the 256-entry table of lower-case "%xy" escapes. */
  private static String[] buildHexTable()
  {
    String[] table = new String[256];
    for (int b = 0; b < table.length; b++) {
      table[b] = "%" + Character.forDigit(b >> 4, 16) + Character.forDigit(b & 0xF, 16);
    }
    return table;
  }
}
Avatar of GwynforWeb
GwynforWeb
Flag of Canada image

<script>
// Pops up how the built-in escape() renders a space, two letters and a newline.
alert(escape(' '))
alert(escape('A'))
alert(escape('a'))
alert(escape('\n'))
</script>
Sorry, I meant to say something. I think the function you want is already in JS, i.e. escape(); unescape() will convert back.
typos

<script>
// Pops up the escape()d form of a space, 'A', 'a' and a newline, in that order.
(function () {
  var samples = [' ', 'A', 'a', '\n']
  for (var n = 0; n < samples.length; n++) alert(escape(samples[n]))
})()
</script>
This is what you need

<script>
// Encodes one character for use in a URL:
//   - the marks - _ . ! ~ * ' ( ) are returned unchanged,
//   - a space becomes '+',
//   - everything else is handed to the built-in escape().
function convert(ch){
 // No blanks inside the character class: a blank there would make the class
 // match ' ' itself and the '+' rule below would never be reached.
 if (/[-_.!~*'()]/.test(ch)) return ch
 if (ch==' ') return '+'
 return escape(ch)
}
(function () {
 // Each entry: the label shown in the popup, then the character passed to convert().
 var cases = [['A  ', 'A'], ['a  ', 'a'], [' ', ' '], ['(  ', '('], ['&  ', '&'], ['\\n  ', '\n'], ['\\r  ', '\r']]
 for (var k = 0; k < cases.length; k++) alert(cases[k][0]+convert(cases[k][1]))
})()
</script>
sorry this

// Encodes one character for a URL: the marks - _ . ! ~ * ' ( ) stay as they are,
// a space turns into '+', and anything else goes through the built-in escape().
function convert(ch){
 var isMark = /[-_.!~*'()]/.test(ch)
 return isMark ? ch : (ch==' ' ? '+' : escape(ch))
}
Avatar of devic
<script>
// hex[n] is the lower-case "%xy" escape for the byte value n, for n = 0..255.
var hex = (function () {
   var digits = "0123456789abcdef";
   var table = [];
   for (var n = 0; n < 256; n++) {
      table[n] = "%" + digits.charAt(n >> 4) + digits.charAt(n & 15);
   }
   return table;
})();

// Returns the "%xy" escape from the hex table for a single character whose
// code is covered by the table; any other value is returned unchanged.
// The parameter is called ch because "char" is a reserved word in ECMAScript 3.
function getChar(ch)
{
      if(typeof ch=="string" && ch.length==1)
      {
            // hex[n] holds the escape for character code n, so index it
            // directly instead of unescaping every entry to find a match.
            var code=ch.charCodeAt(0);
            if(code<hex.length)
            return hex[code];
      }
      return ch;
}
// Splits str into single characters, replaces each one with whatever
// getChar() returns for it, and joins the results back into one string.
function convert(str)
{
      var chars=str.split("");
      for(var n=0,total=chars.length;n<total;n++)
            chars[n]=getChar(chars[n]);
      return chars.join("");
}

// Demo: pop up the sample sentence, an arrow, and its converted form on the next line.
var testString="Congratulations! You have now earned a Brainbench Master Certification in JavaScript 1.5, which is valid for 3 years from today's date."
alert([testString, convert(testString)].join("==>\n"))


</script>
ASKER CERTIFIED SOLUTION
Avatar of GwynforWeb
GwynforWeb
Flag of Canada image

Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
hey Gwyn who said this?
Gwyn, "hex" array is above:

// Write out the character behind every entry of the hex table, each followed by a rule.
var i=0
while(i<hex.length){
document.write(unescape(hex[i]) + "<hr>")
i++
}

I see inside: 0-9a-zA-Z-_.!~*'()

and i thought that all chars must be converted

>>"hey Gwyn who said this?"

look at the comment statements in the function in the question.  

( I gather you are doing another all nighter :-)  )
Ach, you are right, I didn't read the question properly :)
The above function returns the 2-byte number for non-ASCII chars as well. Let's see if these show (probably not):

&#1081;&#1073;&#1083;&#1100; &#1081;&#1076;&#1073; &#1095;&#1083;&#1089; &#1079;&#1097;&#1074;&#1088;&#1094;&#1089; &#946;&#960;&#968;&#951;&#966; &#961;&#953;&#955;&#954; &#964;&#955;&#959;&#972; &#948;&#960;&#947;&#952; &#968;&#945;&#948;
no they do not
Avatar of SweatCoder

ASKER

thanks for all the great feedback. the objective is to convert Japanese (Kanji) text into URL-legal text, and I was told it needed to be UTF-8 format. i'm running some tests based on your examples, and i'll get back to you tomorrow.

thanks!!
The last function I posted will convert your Japanese text to its 2-byte escaped form.
gwyn, by your "last" do you mean:

<script>
// Encodes a whole string for use in a URL, one character at a time:
// the marks - _ . ! ~ * ' ( ) stay unchanged, a space becomes '+', and
// every other character is passed through the built-in escape().
function convert(str){
  // Per-character rule applied to each element of the split string.
  function conChar(ch){
   if (/[-_.!~*'()]/.test(ch)) return ch
   if (ch==' ') return '+'
   return escape(ch)
  }
  // Declared with var so that neither the array nor the loop counter leaks
  // into the global scope (and overwrites a caller's own i).
  var strArr=str.split('')
  for (var i=0;i<strArr.length;i++)
    strArr[i]=conChar(strArr[i])
  return strArr.join('')
}
// Demo call: %, ^ and & get escaped, the blanks become '+', and -_.!~* pass through,
// so the popup reads "%25%5E%26+Hello+World+-_.!~*".
alert(convert('%^& Hello World -_.!~*'))
</script>

i'm getting different results than I got on my Java post here:
https://www.experts-exchange.com/questions/21060607/UTF-8-encoding-URL-safe.html#11565712

. . .and i'm inclined to think that the solution offered in the link above is correct, but i want every perspective. your result is shorter than his. that doesn't mean anything of itself, but i'm just looking at different angles. the filename i'm working with is this:

AAAAAâKâCâhâuâbâN1.pdf  (looks different if Kanji is enabled on your OS, but this is the closest representation i guess)

yours returns: AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf
x4u at the Java post returns: AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf

yours just seems a little too short relative to all the chars that must be accounted for and encoded.

what do you think? it's also interesting to note as pointed out by x4u in the other post, that there's a ready-made jscript function called encodeURIComponent() that returns exactly the same result as the lengthy UTF function, but I can't use it because I need a server-side solution in my ASP code--serverside the function doesn't seem to work, it throws errors. So I use server-side javascript to test the various suggestions.

i'm interested to hear your thoughts.
AAAAAâKâCâhâuâbâN1.pdf

No, I think mine is right: A K C h u b N have not been converted, as required, and the remaining 6 â's have been converted to %E2. Count them, and run this, which converts back:

<script>
     // Pop up what unescape() makes of each of the two candidate encodings.
     (function () {
       var encoded = [
         'AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf',
         'AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'
       ]
       for (var n = 0; n < encoded.length; n++) alert(unescape(encoded[n]))
     })()
</script>

time for bed, good night :-)
Avatar of x4u
x4u

The difference here is that escape() uses an encoding other than UTF-8. It encodes all ISO 8859-1 characters (the lower 256 Unicode chars) with a %xx sequence, while it uses %uxxxx for all character codes >= 256. This behaviour is outdated, and that's why there is now encodeURIComponent().

Try this and you will see that the 2nd one is right:
<script>
     // Decoding is what tells the two strings apart: the first one holds bare
     // %E2 bytes, which are not valid UTF-8, so decodeURIComponent() throws a
     // URIError for it; the second one decodes cleanly.
     try {
       alert(decodeURIComponent('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
     } catch (e) {
       alert('1st string is not valid UTF-8: ' + e)
     }
     alert(decodeURIComponent('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
</script>
OK, here is a little downsized version that creates the array with a loop and uses GwynforWeb's regexp idea.

<html>
<head>
<!-- SAVE AS UTF-8!! (copy paste to notepad, save as, choose encoding:utf-8)
     or remove the meta header field!! -->

<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Encoder Test</title></head>
<body>
<pre>
<script>
// hex[b] is the lower-case "%xy" escape of the byte value b, for b = 0..255.
var digits ="0123456789abcdef";
var hex = [];
for( var idx = 0; idx < 256; idx++ ) {
    hex.push( "%" + digits.charAt( idx >>> 4 ) + digits.charAt( idx % 16 ) );
}

// Percent-encodes s as UTF-8 for use in a URL.
//   - A-Z a-z 0-9 and the marks - _ . ! ~ * ' ( ) are copied unchanged.
//   - Every other character is turned into its UTF-8 bytes (1 to 4 of them),
//     each written as a lower-case "%xy" escape taken from the hex table.
// A UTF-16 surrogate pair is first recombined into its code point so that
// characters above U+FFFF give one 4-byte sequence. An unpaired surrogate
// has no UTF-8 form and still goes through the 3-byte branch.
function encode( s )
{
    var sbuf = "";
    var len = s.length;
    for( var i = 0; i < len; i++ ) {
      var ch = s.charAt(i);

      if( /[A-Za-z0-9\-_.!~*'()]/.test( ch ) ) {
        sbuf += ch;
      } else{
         var cc = s.charCodeAt(i);
         // High surrogate followed by a low surrogate: merge and skip the second unit.
         if (cc >= 0xD800 && cc <= 0xDBFF && i + 1 < len) {
            var low = s.charCodeAt(i + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
               cc = 0x10000 + ((cc - 0xD800) << 10) + (low - 0xDC00);
               i++;
            }
         }
         if (cc <= 0x007f) {          // other ASCII
            sbuf += hex[cc];
         } else if (cc <= 0x07FF) {          // non-ASCII <= 0x7FF
            sbuf += hex[0xc0 | (cc >> 6)]
                 + hex[0x80 | (cc & 0x3F)];
         } else if (cc <= 0xFFFF) {          // 0x7FF < cc <= 0xFFFF
            sbuf += hex[0xe0 | (cc >> 12)]
                 + hex[0x80 | ((cc >> 6) & 0x3F)]
                 + hex[0x80 | (cc & 0x3F)];
         } else {                         // 0xFFFF < cc <= 0x10FFFF
            sbuf += hex[0xf0 | (cc >> 18)]
                 + hex[0x80 | ((cc >> 12) & 0x3F)]
                 + hex[0x80 | ((cc >> 6) & 0x3F)]
                 + hex[0x80 | (cc & 0x3F)];
         }
      }
    }
    return sbuf;
}

// Write the same sample through encode() and through the built-in
// encodeURIComponent(), one under the other, so the two can be compared.
document.writeln( "encode: " + encode("Test ä123ß xyz-_+?") );
document.writeln( "encuri: " + encodeURIComponent("Test ä123ß xyz-_+?") );
document.writeln();
// Decode the two candidate encodings of the file name with unescape() ...
document.writeln( "unescape");
document.writeln( unescape('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
document.writeln( unescape('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
// ... and with decodeURIComponent(). The first candidate is left commented
// out because that call raises an error.
document.writeln( "decodeURIComponent");
// error: document.writeln( decodeURIComponent('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
document.writeln( decodeURIComponent('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
document.writeln();
// The file name from the question, through both encoders.
document.writeln( "encode: " + encode("AAAAAâKâCâhâuâbâN1.pdf") );
document.writeln( "encuri: " + encodeURIComponent("AAAAAâKâCâhâuâbâN1.pdf") );
document.writeln();
// A Russian sample: encode it, then decode a pre-encoded UTF-8 escape string.
document.writeln( "some text in russian:" );
document.writeln( "encode: " + encode("&#1069;&#1082;&#1089;&#1090;&#1088;&#1072;&#1089;&#1077;&#1085;&#1089;&#1099; &#1076;&#1077;&#1081;&#1089;&#1090;&#1074;&#1080;&#1090;&#1077;&#1083;&#1100;&#1085;&#1086;") );
document.writeln( "decode: " + decodeURIComponent( "%d0%ad%d0%ba%d1%81%d1%82%d1%80%d0%b0%d1%81%d0%b5%d0%bd%d1%81" +
                   "%d1%8b%20%d0%b4%d0%b5%d0%b9%d1%81%d1%82%d0%b2%d0%b8%d1%82%d0%b5%d0%bb%d1%8c%d0%bd%d0%be" ) );

</script>
</pre>
</body>
</html>
Well, actually you don't need to save it as UTF-8 anymore, as EE's server has converted the Russian text into HTML character entities. But it still works.
GwynforWeb, thanks for contributing to the more compact version of the final solution. and thanks x4u for the clarifications and revisions.