UTF-8 encoding

There's a Java class that does UTF-8 encoding, but I need it to work in JavaScript. Can anyone convert the following so that it will work as a JavaScript function? I tried for about an hour and kept getting errors, because I don't think JS supports the hex[] lookup or append() in the way they're being used, and eventually I gave up. I changed all the appends to +=, but I kept getting stuck on the hex stuff.

I got the code below from http://www.w3.org/International/O-URL-code.html

/**
 * Provides a method to encode any string into a URL-safe
 * form.
 * Non-ASCII characters are first encoded as sequences of
 * two to four bytes, using the UTF-8 algorithm, before being
 * encoded as %HH escapes.
 */
public class URLUTF8Encoder
{

  /**
   * Lookup table mapping each byte value 0..255 to its lower-case
   * "%xx" escape, e.g. <code>hex[0x2f]</code> is "%2f".
   */
  final static String[] hex = new String[256];

  static {
    final String digits = "0123456789abcdef";
    for (int b = 0; b < hex.length; b++) {
      hex[b] = "%" + digits.charAt(b >> 4) + digits.charAt(b & 0x0f);
    }
  }

  /**
   * Encode a string to the "x-www-form-urlencoded" form, enhanced
   * with the UTF-8-in-URL proposal. This is what happens:
   *
   * <ul>
   * <li><p>The ASCII characters 'a' through 'z', 'A' through 'Z',
   *        and '0' through '9' remain the same.
   *
   * <li><p>The unreserved characters - _ . ! ~ * ' ( ) remain the same.
   *
   * <li><p>The space character ' ' is converted into a plus sign '+'.
   *
   * <li><p>All other ASCII characters are converted into the
   *        3-character string "%xy", where xy is
   *        the two-digit hexadecimal representation of the character
   *        code
   *
   * <li><p>All non-ASCII characters are encoded in two steps: first
   *        to a sequence of 2 to 4 bytes, using the UTF-8 algorithm;
   *        secondly each of these bytes is encoded as "%xx".
   *        A surrogate pair is combined into its single code point
   *        and encoded as one 4-byte sequence. An unpaired surrogate
   *        is encoded as a 3-byte sequence, exactly as before.
   * </ul>
   *
   * @param s The string to be encoded; must not be null
   * @return The encoded string
   * @throws NullPointerException if <code>s</code> is null
   */
  public static String encode(String s)
  {
    int len = s.length();
    StringBuilder sbuf = new StringBuilder(len);
    // Walk the string by code point, not by UTF-16 code unit, so that
    // characters above U+FFFF are seen as one value.
    for (int i = 0; i < len; ) {
      int ch = s.codePointAt(i);
      i += Character.charCount(ch);
      if (('A' <= ch && ch <= 'Z')              // 'A'..'Z'
          || ('a' <= ch && ch <= 'z')           // 'a'..'z'
          || ('0' <= ch && ch <= '9')) {        // '0'..'9'
        sbuf.append((char) ch);
      } else if (ch == ' ') {                   // space
        sbuf.append('+');
      } else if (ch == '-' || ch == '_'         // unreserved
          || ch == '.' || ch == '!'
          || ch == '~' || ch == '*'
          || ch == '\'' || ch == '('
          || ch == ')') {
        sbuf.append((char) ch);
      } else if (ch <= 0x007f) {                // other ASCII
        sbuf.append(hex[ch]);
      } else if (ch <= 0x07ff) {                // 2 bytes: U+0080..U+07FF
        sbuf.append(hex[0xc0 | (ch >> 6)]);
        sbuf.append(hex[0x80 | (ch & 0x3f)]);
      } else if (ch <= 0xffff) {                // 3 bytes: U+0800..U+FFFF
        sbuf.append(hex[0xe0 | (ch >> 12)]);
        sbuf.append(hex[0x80 | ((ch >> 6) & 0x3f)]);
        sbuf.append(hex[0x80 | (ch & 0x3f)]);
      } else {                                  // 4 bytes: U+10000..U+10FFFF
        sbuf.append(hex[0xf0 | (ch >> 18)]);
        sbuf.append(hex[0x80 | ((ch >> 12) & 0x3f)]);
        sbuf.append(hex[0x80 | ((ch >> 6) & 0x3f)]);
        sbuf.append(hex[0x80 | (ch & 0x3f)]);
      }
    }
    return sbuf.toString();
  }
}
LVL 11
SweatCoderAsked:
Who is Participating?
I wear a lot of hats...

"The solutions and answers provided on Experts Exchange have been extremely helpful to me over the last few years. I wear a lot of hats - Developer, Database Administrator, Help Desk, etc., so I know a lot of things but not a lot about one thing. Experts Exchange gives me answers from people who do know a lot about one thing, in a easy to use platform." -Todd S.

GwynforWebCommented:
<script>
// Show what the built-in escape() returns for a few sample characters.
alert(escape(' '))
alert(escape('A'))
alert(escape('a'))
alert(escape('\n'))
</script>
GwynforWebCommented:
Sorry, I meant to say something. I think the function you want is already in JS, i.e. escape(); unescape() will convert back.
GwynforWebCommented:
typos

<script>
// Show what the built-in escape() returns for a few sample characters.
alert(escape(' '))   // "%20"
alert(escape('A'))   // "A" (letters are left as they are)
alert(escape('a'))   // "a"
alert(escape('\n'))  // "%0A"
</script>
Become a Certified Penetration Testing Engineer

This CPTE Certified Penetration Testing Engineer course covers everything you need to know about becoming a Certified Penetration Testing Engineer. Career Path: Professional roles include Ethical Hackers, Security Consultants, System Administrators, and Chief Security Officers.

GwynforWebCommented:
This is what you need

<script>
/**
 * URL-encodes a single character.
 * - _ . ! ~ * ' ( ) are returned unchanged, a space becomes '+', and
 * everything else goes through escape() (which leaves letters and digits
 * alone).
 * Note: escape() is not UTF-8; codes 128..255 come back as one %XX and
 * codes above 255 as %uXXXX.
 */
function convert(ch){
 // No spaces inside the character class: a space there would match ' '
 // and return it before the '+' rule below could run.
 if (/[-_.!~*'()]/.test(ch)) return ch
 if (ch==' ') return '+'
 // escape() does not encode @ + / ; a raw '+' would read back as a space.
 if (ch=='@' || ch=='+' || ch=='/') return '%'+ch.charCodeAt(0).toString(16).toUpperCase()
 return escape(ch)
}
alert('A  '+convert('A'))
alert('a  '+convert('a'))
alert(' '+convert(' '))
alert('(  '+convert('('))
alert('&  '+convert('&'))
alert('\\n  '+convert('\n'))
alert('\\r  '+convert('\r'))
</script>
GwynforWebCommented:
sorry this

/**
 * URL-encodes a single character.
 * - _ . ! ~ * ' ( ) are returned unchanged, a space becomes '+', and
 * everything else goes through escape() (which leaves letters and digits
 * alone).
 * Note: escape() is not UTF-8; codes 128..255 come back as one %XX and
 * codes above 255 as %uXXXX.
 */
function convert(ch){
 if (/[-_.!~*'()]/.test(ch)) return ch
 if (ch==' ') return '+'
 // escape() does not encode @ + / ; a raw '+' would read back as a space.
 if (ch=='@' || ch=='+' || ch=='/') return '%'+ch.charCodeAt(0).toString(16).toUpperCase()
 return escape(ch)
}
devicCommented:
<script>
// Table of the 256 lower-case "%xx" escapes; hex[n] is the escape for
// byte value n (hex[0] is "%00", hex[255] is "%ff").
var hex = (function () {
   var table = [];
   var digits = "0123456789abcdef";
   for (var n = 0; n < 256; n++) {
      table[n] = "%" + digits.charAt(n >> 4) + digits.charAt(n & 0xf);
   }
   return table;
})();

/**
 * Returns the "%xx" escape from the hex table for a single character whose
 * code is below 256 (letters and digits included). Any other value is
 * returned unchanged.
 */
function getChar(char)
{
      // hex[n] is the escape for character code n, so index the table
      // directly instead of unescaping and comparing every entry.
      if(typeof char=="string" && char.length==1)
      {
            var code=char.charCodeAt(0);
            if(code<hex.length)
            return hex[code];
      }
      return char;
}
/**
 * Runs every character of str through getChar() and returns the pieces
 * joined back into one string.
 */
function convert(str)
{
      var pieces=str.split("");
      var count=pieces.length;
      var k=0;
      while(k<count)
      {
            pieces[k]=getChar(pieces[k]);
            k++;
      }
      return pieces.join("");
}

// Demo: encode a sample sentence and show the original next to the result.
var testString="Congratulations! You have now earned a Brainbench Master Certification in JavaScript 1.5, which is valid for 3 years from today's date."
alert(testString+"==>\n" + convert(testString))


</script>
GwynforWebCommented:
Devic, 0-9a-zA-Z-_.!~*'() are to be left unchanged.

Here is mine for a string

<script>
/**
 * URL-encodes a string one character at a time.
 * - _ . ! ~ * ' ( ) are kept, a space becomes '+', and everything else goes
 * through escape() (which leaves letters and digits alone).
 * Note: escape() is not UTF-8; codes 128..255 come back as one %XX and
 * codes above 255 as %uXXXX.
 */
function convert(str){
  function conChar(ch){
   if (/[-_.!~*'()]/.test(ch)) return ch
   if (ch==' ') return '+'
   // escape() does not encode @ + / ; a raw '+' would read back as a space.
   if (ch=='@' || ch=='+' || ch=='/') return '%'+ch.charCodeAt(0).toString(16).toUpperCase()
   return escape(ch)
  }
// Declared with var so that strArr and i do not leak into the global scope.
var strArr=str.split('')
for (var i=0;i<strArr.length;i++)
  strArr[i]=conChar(strArr[i])
return strArr.join('')
}
alert(convert('%^& Hello World -_.!~*'))
</script>

Experts Exchange Solution brought to you by

Your issues matter to us.

Facing a tech roadblock? Get the help and guidance you need from experienced professionals who care. Ask your question anytime, anywhere, with no hassle.

Start your 7-day free trial
devicCommented:
hey Gwyn who said this?
devicCommented:
Gwyn, "hex" array is above:

// Print the character that each "%xx" entry of the hex table decodes to.
for(var i=0;i<hex.length;i++)
document.write(unescape(hex[i]) + "<hr>")

i see inside: 0-9a-zA-z-_.!~*'()

and i thought that all chars must be converted

GwynforWebCommented:
>>"hey Gwyn who said this?"

look at the comment statements in the function in the question.  

( I gather you are doing another all nighter :-)  )
devicCommented:
ach, you are right, i didn't read good the question :)
GwynforWebCommented:
The above function returns the 2-byte number for non-ASCII chars as well. Let's see if these show (probably not).

&#1081;&#1073;&#1083;&#1100; &#1081;&#1076;&#1073; &#1095;&#1083;&#1089; &#1079;&#1097;&#1074;&#1088;&#1094;&#1089; &#946;&#960;&#968;&#951;&#966; &#961;&#953;&#955;&#954; &#964;&#955;&#959;&#972; &#948;&#960;&#947;&#952; &#968;&#945;&#948;
GwynforWebCommented:
no they do not
SweatCoderAuthor Commented:
thanks for all the great feedback. the objective is to convert Japanese (Kanji) text into URL-legal text, and I was told it needed to be UTF-8 format. i'm running some tests based on your examples, and i'll get back to you tomorrow.

thanks!!
GwynforWebCommented:
The last function I posted will convert your Japanese text to its 2-byte escaped form.
SweatCoderAuthor Commented:
gwyn, by your "last" do you mean:

<script>
/**
 * URL-encodes a string one character at a time.
 * - _ . ! ~ * ' ( ) are kept, a space becomes '+', and everything else goes
 * through escape() (which leaves letters and digits alone).
 * Note: escape() is not UTF-8; codes 128..255 come back as one %XX and
 * codes above 255 as %uXXXX.
 */
function convert(str){
  function conChar(ch){
   if (/[-_.!~*'()]/.test(ch)) return ch
   if (ch==' ') return '+'
   // escape() does not encode @ + / ; a raw '+' would read back as a space.
   if (ch=='@' || ch=='+' || ch=='/') return '%'+ch.charCodeAt(0).toString(16).toUpperCase()
   return escape(ch)
  }
// Declared with var so that strArr and i do not leak into the global scope.
var strArr=str.split('')
for (var i=0;i<strArr.length;i++)
  strArr[i]=conChar(strArr[i])
return strArr.join('')
}
alert(convert('%^& Hello World -_.!~*'))
</script>

i'm getting different results than I got on my Java post here:
http://www.experts-exchange.com/Programming/Programming_Languages/Java/Q_21060607.html#11565712

. . .and i'm inclined to think that the solution offered in the link above is correct, but i want every perspective. your result is shorter than his. that doesn't mean anything of itself, but i'm just looking at different angles. the filename i'm working with is this:

AAAAAâKâCâhâuâbâN1.pdf  (looks different if Kanji is enabled on your OS, but this is the closest representation i guess)

yours returns: AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf
x4u at the Java post returns: AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf

yours just seems a little too short relative to all the chars that must be accounted for and encoded.

what do you think? it's also interesting to note as pointed out by x4u in the other post, that there's a ready-made jscript function called encodeURIComponent() that returns exactly the same result as the lengthy UTF function, but I can't use it because I need a server-side solution in my ASP code--serverside the function doesn't seem to work, it throws errors. So I use server-side javascript to test the various suggestions.

i'm interested to hear your thoughts.
GwynforWebCommented:
AAAAAâKâCâhâuâbâN1.pdf

No, I think mine is right: A K C h b N have not been converted, as required, and the remaining 6 â's have been converted to %E2. Count them, and run this, which converts back:

<script>
     alert(unescape('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
     alert(unescape('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
</script>

time for bed, good night :-)
x4uCommented:
The difference here is that escape uses a different encoding than UTF-8. It encodes all ISO 8859-1 characters (the lower 256 Unicode chars) with a %xx sequence, while it uses %uxxxx for all character codes >= 256. This behaviour is outdated, and that's why there is now encodeURIComponent.

Try this and you will see that the 2nd one is right:
<script>
     alert(encodeURIComponent('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
     alert(encodeURIComponent('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
</script>
x4uCommented:
OK, here is a little downsized version that creates the array with a loop and uses GwynforWeb's regexp idea.

<html>
<head>
<!-- SAVE AS UTF-8!! (copy paste to notepad, save as, choose encoding:utf-8)
     or remove the meta header field!! -->

<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Encoder Test</title></head>
<body>
<pre>
<script>
// Table of the 256 lower-case "%xx" escapes; hex[n] is the escape for
// byte value n.
var digits ="0123456789abcdef";
var hex = new Array( 256 );
for( var idx = 0; idx < 256; idx++ )
    hex[ idx ] = "%" + digits.charAt( idx >> 4 ) + digits.charAt( idx & 0xf );

/**
 * Encodes a string into URL-safe form using UTF-8 and %xx escapes.
 *
 * Letters, digits and - _ . ! ~ * ' ( ) are kept as they are. Every other
 * character, including the space (which becomes "%20"), is converted to its
 * UTF-8 byte sequence of 1 to 4 bytes, each byte written as a lower-case
 * "%xx". A surrogate pair is combined into one code point and written as a
 * single 4-byte sequence. An unpaired surrogate is written as a 3-byte
 * sequence.
 *
 * @param s the string to encode
 * @return the encoded string
 */
function encode( s )
{
    var sbuf = "";
    var len = s.length;
    for( var i = 0; i < len; i++ ) {
      var ch = s.charAt(i);

      // The hyphen is placed last so it cannot be read as a range.
      if( /[A-Za-z0-9_.!~*'()-]/.test( ch ) ) {
        sbuf += ch;
      } else{
         var cc = s.charCodeAt(i);
         // A high surrogate followed by a low surrogate is one character
         // above U+FFFF: merge the two code units and skip the second.
         if (0xD800 <= cc && cc <= 0xDBFF && i + 1 < len) {
            var low = s.charCodeAt(i + 1);
            if (0xDC00 <= low && low <= 0xDFFF) {
               cc = 0x10000 + ((cc - 0xD800) << 10) + (low - 0xDC00);
               i++;
            }
         }
         if (cc <= 0x007f) {          // other ASCII
            sbuf += hex[cc];
         } else if (cc <= 0x07FF) {          // non-ASCII <= 0x7FF
            sbuf += hex[0xc0 | (cc >> 6)]
                 + hex[0x80 | (cc & 0x3F)];
         } else if (cc <= 0xFFFF) {          // 0x7FF < cc <= 0xFFFF
            sbuf += hex[0xe0 | (cc >> 12)]
                 + hex[0x80 | ((cc >> 6) & 0x3F)]
                 + hex[0x80 | (cc & 0x3F)];
         } else {                         // 0xFFFF < cc <= 0x10FFFF
            sbuf += hex[0xf0 | (cc >> 18)]
                 + hex[0x80 | ((cc >> 12) & 0x3F)]
                 + hex[0x80 | ((cc >> 6) & 0x3F)]
                 + hex[0x80 | (cc & 0x3F)];
         }
      }
    }
    return sbuf;
}

// Demo output: compare encode() with the built-in encodeURIComponent, then
// decode both candidate encodings of the sample file name with unescape and
// with decodeURIComponent.
document.writeln( "encode: " + encode("Test ä123ß xyz-_+?") );
document.writeln( "encuri: " + encodeURIComponent("Test ä123ß xyz-_+?") );
document.writeln();
document.writeln( "unescape");
document.writeln( unescape('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
document.writeln( unescape('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
document.writeln( "decodeURIComponent");
// error: document.writeln( decodeURIComponent('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
document.writeln( decodeURIComponent('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
document.writeln();
document.writeln( "encode: " + encode("AAAAAâKâCâhâuâbâN1.pdf") );
document.writeln( "encuri: " + encodeURIComponent("AAAAAâKâCâhâuâbâN1.pdf") );
document.writeln();
document.writeln( "some text in russian:" );
document.writeln( "encode: " + encode("&#1069;&#1082;&#1089;&#1090;&#1088;&#1072;&#1089;&#1077;&#1085;&#1089;&#1099; &#1076;&#1077;&#1081;&#1089;&#1090;&#1074;&#1080;&#1090;&#1077;&#1083;&#1100;&#1085;&#1086;") );
document.writeln( "decode: " + decodeURIComponent( "%d0%ad%d0%ba%d1%81%d1%82%d1%80%d0%b0%d1%81%d0%b5%d0%bd%d1%81" +
                   "%d1%8b%20%d0%b4%d0%b5%d0%b9%d1%81%d1%82%d0%b2%d0%b8%d1%82%d0%b5%d0%bb%d1%8c%d0%bd%d0%be" ) );

</script>
</pre>
</body>
</html>
x4uCommented:
Well actually you don't need to save it as UTF-8 anymore as EE's server has converted the russian text into HTML character entities. But it still works.
SweatCoderAuthor Commented:
GwynforWeb, thanks for contributing to the more compact version of the final solution. and thanks x4u for the clarifications and revisions.
It's more than this solution.Get answers and train to solve all your tech problems - anytime, anywhere.Try it for free Edge Out The Competitionfor your dream job with proven skills and certifications.Get started today Stand Outas the employee with proven skills.Start learning today for free Move Your Career Forwardwith certification training in the latest technologies.Start your trial today
JavaScript

From novice to tech pro — start learning today.