SweatCoder
asked on
UTF-8 encoding
there's a java class that does UTF-8 encoding, but I need it to work in javascript. can anyone convert the following so that it will work in a javascript function? i tried for about an hour, and i kept getting errors because i don't think js supports hex[] call nor append in the way it's being used, and eventually i gave up. I changed all the appends to +=, but I kept getting stuck on the hex stuff.
I got the code below from http://www.w3.org/International/O-URL-code.html
/**
 * Provides a method to encode any string into a URL-safe
 * form.
 * Non-ASCII characters are first encoded as sequences of
 * two to four bytes, using the UTF-8 algorithm, before being
 * encoded as %HH escapes.
 */
public class URLUTF8Encoder
{
    /** Lookup table: hex[b] is the lowercase "%xx" escape of the byte value b (0..255). */
    final static String[] hex = new String[256];

    static {
        for (int b = 0; b < hex.length; b++) {
            hex[b] = "%" + Character.forDigit(b >> 4, 16) + Character.forDigit(b & 0xf, 16);
        }
    }

    /**
     * Encode a string to the "x-www-form-urlencoded" form, enhanced
     * with the UTF-8-in-URL proposal. This is what happens:
     *
     * <ul>
     * <li><p>The ASCII characters 'a' through 'z', 'A' through 'Z',
     * and '0' through '9' remain the same.
     *
     * <li><p>The unreserved characters - _ . ! ~ * ' ( ) remain the same.
     *
     * <li><p>The space character ' ' is converted into a plus sign '+'.
     *
     * <li><p>All other ASCII characters are converted into the
     * 3-character string "%xy", where xy is
     * the two-digit hexadecimal representation of the character
     * code
     *
     * <li><p>All non-ASCII characters are encoded in two steps: first
     * to a sequence of 2 to 4 bytes, using the UTF-8 algorithm;
     * secondly each of these bytes is encoded as "%xx".
     * A surrogate pair is combined into one code point and encoded
     * as a single 4-byte sequence; an unpaired surrogate is still
     * encoded on its own as 3 bytes, as before.
     * </ul>
     *
     * @param s The string to be encoded
     * @return The encoded string
     * @throws NullPointerException if s is null
     */
    public static String encode(String s)
    {
        int len = s.length();
        StringBuilder sbuf = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            char c = s.charAt(i);
            int ch = c;
            // Java strings are UTF-16: a character outside the BMP arrives as a
            // high/low surrogate pair that must be merged before UTF-8 encoding.
            if (Character.isHighSurrogate(c) && i + 1 < len) {
                char next = s.charAt(i + 1);
                if (Character.isLowSurrogate(next)) {
                    ch = Character.toCodePoint(c, next);
                    i++; // the low surrogate has been consumed
                }
            }
            if (('A' <= ch && ch <= 'Z')
                    || ('a' <= ch && ch <= 'z')
                    || ('0' <= ch && ch <= '9')) { // alphanumerics
                sbuf.append((char) ch);
            } else if (ch == ' ') { // space
                sbuf.append('+');
            } else if (ch == '-' || ch == '_' // unreserved
                    || ch == '.' || ch == '!'
                    || ch == '~' || ch == '*'
                    || ch == '\'' || ch == '('
                    || ch == ')') {
                sbuf.append((char) ch);
            } else if (ch <= 0x007f) { // other ASCII
                sbuf.append(hex[ch]);
            } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF: 2 bytes
                sbuf.append(hex[0xc0 | (ch >> 6)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            } else if (ch <= 0xFFFF) { // 0x7FF < ch <= 0xFFFF: 3 bytes
                sbuf.append(hex[0xe0 | (ch >> 12)]);
                sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            } else { // 0xFFFF < ch <= 0x10FFFF: 4 bytes
                sbuf.append(hex[0xf0 | (ch >> 18)]);
                sbuf.append(hex[0x80 | ((ch >> 12) & 0x3F)]);
                sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            }
        }
        return sbuf.toString();
    }
}
I got the code below from http://www.w3.org/International/O-URL-code.html
/**
 * Provides a method to encode any string into a URL-safe
 * form.
 * Non-ASCII characters are first encoded as sequences of
 * two to four bytes, using the UTF-8 algorithm, before being
 * encoded as %HH escapes.
 */
public class URLUTF8Encoder
{
    /** Lookup table: hex[b] is the lowercase "%xx" escape of the byte value b (0..255). */
    final static String[] hex = new String[256];

    static {
        for (int b = 0; b < hex.length; b++) {
            hex[b] = "%" + Character.forDigit(b >> 4, 16) + Character.forDigit(b & 0xf, 16);
        }
    }

    /**
     * Encode a string to the "x-www-form-urlencoded" form, enhanced
     * with the UTF-8-in-URL proposal. This is what happens:
     *
     * <ul>
     * <li><p>The ASCII characters 'a' through 'z', 'A' through 'Z',
     * and '0' through '9' remain the same.
     *
     * <li><p>The unreserved characters - _ . ! ~ * ' ( ) remain the same.
     *
     * <li><p>The space character ' ' is converted into a plus sign '+'.
     *
     * <li><p>All other ASCII characters are converted into the
     * 3-character string "%xy", where xy is
     * the two-digit hexadecimal representation of the character
     * code
     *
     * <li><p>All non-ASCII characters are encoded in two steps: first
     * to a sequence of 2 to 4 bytes, using the UTF-8 algorithm;
     * secondly each of these bytes is encoded as "%xx".
     * A surrogate pair is combined into one code point and encoded
     * as a single 4-byte sequence; an unpaired surrogate is still
     * encoded on its own as 3 bytes, as before.
     * </ul>
     *
     * @param s The string to be encoded
     * @return The encoded string
     * @throws NullPointerException if s is null
     */
    public static String encode(String s)
    {
        int len = s.length();
        StringBuilder sbuf = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            char c = s.charAt(i);
            int ch = c;
            // Java strings are UTF-16: a character outside the BMP arrives as a
            // high/low surrogate pair that must be merged before UTF-8 encoding.
            if (Character.isHighSurrogate(c) && i + 1 < len) {
                char next = s.charAt(i + 1);
                if (Character.isLowSurrogate(next)) {
                    ch = Character.toCodePoint(c, next);
                    i++; // the low surrogate has been consumed
                }
            }
            if (('A' <= ch && ch <= 'Z')
                    || ('a' <= ch && ch <= 'z')
                    || ('0' <= ch && ch <= '9')) { // alphanumerics
                sbuf.append((char) ch);
            } else if (ch == ' ') { // space
                sbuf.append('+');
            } else if (ch == '-' || ch == '_' // unreserved
                    || ch == '.' || ch == '!'
                    || ch == '~' || ch == '*'
                    || ch == '\'' || ch == '('
                    || ch == ')') {
                sbuf.append((char) ch);
            } else if (ch <= 0x007f) { // other ASCII
                sbuf.append(hex[ch]);
            } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF: 2 bytes
                sbuf.append(hex[0xc0 | (ch >> 6)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            } else if (ch <= 0xFFFF) { // 0x7FF < ch <= 0xFFFF: 3 bytes
                sbuf.append(hex[0xe0 | (ch >> 12)]);
                sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            } else { // 0xFFFF < ch <= 0x10FFFF: 4 bytes
                sbuf.append(hex[0xf0 | (ch >> 18)]);
                sbuf.append(hex[0x80 | ((ch >> 12) & 0x3F)]);
                sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            }
        }
        return sbuf.toString();
    }
}
Sorry, I meant to say something. I think the function you want is already in JS, i.e. escape(); unescape() will convert back.
typos
<script>
// Show what escape() returns for a space, an upper-case letter,
// a lower-case letter and a newline, one alert per sample.
(function () {
  var samples = [' ', 'A', 'a', '\n'];
  for (var k = 0; k < samples.length; k++) {
    alert(escape(samples[k]));
  }
})();
</script>
<script>
// Show what escape() returns for a space, an upper-case letter,
// a lower-case letter and a newline, one alert per sample.
(function () {
  var samples = [' ', 'A', 'a', '\n'];
  for (var k = 0; k < samples.length; k++) {
    alert(escape(samples[k]));
  }
})();
</script>
This is what you need
<script>
// Converts one character for use in a URL: the unreserved punctuation
// - _ . ! ~ * ' ( ) is returned unchanged, a space becomes '+', and
// anything else is passed through escape().
function convert(ch){
  // The character class must not contain spaces: a literal space in the
  // class made ' ' match here, so the '+' branch below was never reached.
  if (/[-_.!~*'()]/.test(ch)) return ch
  if (ch==' ') return '+'
  return escape(ch)
}
// Demo: alert each sample character (or its printable label) followed by
// the result of convert() for that character.
alert('A '+convert('A'))
alert('a '+convert('a'))
alert(' '+convert(' '))
alert('( '+convert('('))
alert('& '+convert('&'))
alert('\\n '+convert('\n'))
alert('\\r '+convert('\r'))
</script>
<script>
// Converts one character for use in a URL: the unreserved punctuation
// - _ . ! ~ * ' ( ) is returned unchanged, a space becomes '+', and
// anything else is passed through escape().
function convert(ch){
  // The character class must not contain spaces: a literal space in the
  // class made ' ' match here, so the '+' branch below was never reached.
  if (/[-_.!~*'()]/.test(ch)) return ch
  if (ch==' ') return '+'
  return escape(ch)
}
// Demo: alert each sample character (or its printable label) followed by
// the result of convert() for that character.
alert('A '+convert('A'))
alert('a '+convert('a'))
alert(' '+convert(' '))
alert('( '+convert('('))
alert('& '+convert('&'))
alert('\\n '+convert('\n'))
alert('\\r '+convert('\r'))
</script>
sorry this
// Encodes one character: unreserved punctuation - _ . ! ~ * ' ( ) passes
// through untouched, a space turns into '+', everything else goes to escape().
function convert(ch){
  var unreserved = /[-_.!~*'()]/;
  if (unreserved.test(ch)) {
    return ch;
  }
  return ch == ' ' ? '+' : escape(ch);
}
// Encodes one character: unreserved punctuation - _ . ! ~ * ' ( ) passes
// through untouched, a space turns into '+', everything else goes to escape().
function convert(ch){
  var unreserved = /[-_.!~*'()]/;
  if (unreserved.test(ch)) {
    return ch;
  }
  return ch == ' ' ? '+' : escape(ch);
}
<script>
// hex[b] is the lowercase "%xx" escape for the byte value b (0..255).
var hex = [];
(function () {
  var digits = "0123456789abcdef";
  for (var b = 0; b < 256; b++) {
    hex[b] = "%" + digits.charAt(b >> 4) + digits.charAt(b & 0xf);
  }
})();

// Returns the "%xx" escape for a single character whose code is below 256;
// any other value is returned unchanged.
// The parameter is named ch because `char` is a reserved word in older engines.
function getChar(ch)
{
  var s = String(ch);
  if (s.length === 1) {
    // Direct table lookup instead of scanning all 256 entries with unescape().
    var code = s.charCodeAt(0);
    if (code < 256)
      return hex[code];
  }
  return ch;
}

// Escapes every character of str via getChar() and returns the joined result.
// Note that letters and digits are escaped too (e.g. "A" becomes "%41").
function convert(str)
{
  var chars = str.split("");
  var newstr = [];
  for (var i = 0; i < chars.length; i++)
  {
    newstr[i] = getChar(chars[i]);
  }
  return newstr.join("");
}
// Demo: show the sample sentence followed by its converted form.
var testString="Congratulations! You have now earned a Brainbench Master Certification in JavaScript 1.5, which is valid for 3 years from today's date."
alert(testString+"==>\n" + convert(testString))
</script>
// hex[b] is the lowercase "%xx" escape for the byte value b (0..255).
var hex = [];
(function () {
  var digits = "0123456789abcdef";
  for (var b = 0; b < 256; b++) {
    hex[b] = "%" + digits.charAt(b >> 4) + digits.charAt(b & 0xf);
  }
})();

// Returns the "%xx" escape for a single character whose code is below 256;
// any other value is returned unchanged.
// The parameter is named ch because `char` is a reserved word in older engines.
function getChar(ch)
{
  var s = String(ch);
  if (s.length === 1) {
    // Direct table lookup instead of scanning all 256 entries with unescape().
    var code = s.charCodeAt(0);
    if (code < 256)
      return hex[code];
  }
  return ch;
}

// Escapes every character of str via getChar() and returns the joined result.
// Note that letters and digits are escaped too (e.g. "A" becomes "%41").
function convert(str)
{
  var chars = str.split("");
  var newstr = [];
  for (var i = 0; i < chars.length; i++)
  {
    newstr[i] = getChar(chars[i]);
  }
  return newstr.join("");
}
// Demo: show the sample sentence followed by its converted form.
var testString="Congratulations! You have now earned a Brainbench Master Certification in JavaScript 1.5, which is valid for 3 years from today's date."
alert(testString+"==>\n" + convert(testString))
</script>
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
hey Gwyn who said this?
Gwyn, "hex" array is above:
// Write the unescaped character for every entry of the hex table,
// each followed by a horizontal rule.
for(var i=0;i<hex.length;i++)
document.write(unescape(hex[i]) + "<hr>")
i see inside: 0-9a-zA-z-_.!~*'()
and i thought that all chars must be converted
// Write the unescaped character for every entry of the hex table,
// each followed by a horizontal rule.
for(var i=0;i<hex.length;i++)
document.write(unescape(hex[i]) + "<hr>")
i see inside: 0-9a-zA-z-_.!~*'()
and i thought that all chars must be converted
>>"hey Gwyn who said this?"
look at the comment statements in the function in the question.
( I gather you are doing another all nighter :-) )
look at the comment statements in the function in the question.
( I gather you are doing another all nighter :-) )
Ach, you are right, I didn't read the question properly :)
the above function return the 2 byte number for non ascii chars as well. Lets see if these show (probably not)
йблn 0; йдб члс зщвl 8;ц& #1089; βπψη&# 966; ριλκ τλοό δπγθ ψαδ
йблn
no they do not
ASKER
thanks for all the great feedback. the objective is to convert Japanese (Kanji) text into URL-legal text, and I was told it needed to be UTF-8 format. i'm running some tests based on your examples, and i'll get back to you tomorrow.
thanks!!
thanks!!
The last function I posted will convert your Japanese text to its 2-byte escaped form.
ASKER
gwyn, by your "last" do you mean:
<script>
// Converts every character of str: unreserved punctuation - _ . ! ~ * ' ( )
// is kept, a space becomes '+', and everything else goes through escape().
// Returns the converted string.
function convert(str){
  function conChar(ch){
    if (/[-_.!~*'()]/.test(ch)) return ch
    if (ch==' ') return '+'
    return escape(ch)
  }
  // Declared with var so the array and the loop index do not leak as globals.
  var strArr=str.split('')
  for (var i=0;i<strArr.length;i++)
    strArr[i]=conChar(strArr[i])
  return strArr.join('')
}
// Demo: alert the converted form of a string mixing escaped and unreserved characters.
alert(convert('%^& Hello World -_.!~*'))
</script>
i'm getting different results than I got on my Java post here:
https://www.experts-exchange.com/questions/21060607/UTF-8-encoding-URL-safe.html#11565712
. . .and i'm inclined to think that the solution offered in the link above is correct, but i want every perspective. your result is shorter than his. that doesn't mean anything of itself, but i'm just looking at different angles. the filename i'm working with is this:
AAAAAâKâCâhâuâbâN1.pdf (looks different if Kanji is enabled on your OS, but this is the closest representation i guess)
yours returns: AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf
x4u at the Java post returns: AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf
yours just seems a little too short relative to all the chars that must be accounted for and encoded.
what do you think? it's also interesting to note as pointed out by x4u in the other post, that there's a ready-made jscript function called encodeURIComponent() that returns exactly the same result as the lengthy UTF function, but I can't use it because I need a server-side solution in my ASP code--serverside the function doesn't seem to work, it throws errors. So I use server-side javascript to test the various suggestions.
i'm interested to hear your thoughts.
<script>
// Converts every character of str: unreserved punctuation - _ . ! ~ * ' ( )
// is kept, a space becomes '+', and everything else goes through escape().
// Returns the converted string.
function convert(str){
  function conChar(ch){
    if (/[-_.!~*'()]/.test(ch)) return ch
    if (ch==' ') return '+'
    return escape(ch)
  }
  // Declared with var so the array and the loop index do not leak as globals.
  var strArr=str.split('')
  for (var i=0;i<strArr.length;i++)
    strArr[i]=conChar(strArr[i])
  return strArr.join('')
}
// Demo: alert the converted form of a string mixing escaped and unreserved characters.
alert(convert('%^& Hello World -_.!~*'))
</script>
i'm getting different results than I got on my Java post here:
https://www.experts-exchange.com/questions/21060607/UTF-8-encoding-URL-safe.html#11565712
. . .and i'm inclined to think that the solution offered in the link above is correct, but i want every perspective. your result is shorter than his. that doesn't mean anything of itself, but i'm just looking at different angles. the filename i'm working with is this:
AAAAAâKâCâhâuâbâN1.pdf (looks different if Kanji is enabled on your OS, but this is the closest representation i guess)
yours returns: AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf
x4u at the Java post returns: AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf
yours just seems a little too short relative to all the chars that must be accounted for and encoded.
what do you think? it's also interesting to note as pointed out by x4u in the other post, that there's a ready-made jscript function called encodeURIComponent() that returns exactly the same result as the lengthy UTF function, but I can't use it because I need a server-side solution in my ASP code--serverside the function doesn't seem to work, it throws errors. So I use server-side javascript to test the various suggestions.
i'm interested to hear your thoughts.
AAAAAâKâCâhâuâbâN1.pdf
No, I think mine is right: A K C h b N have not been converted, as required, and the remaining 6 â's have been converted to %E2. Count them, and run this, which converts back:
<script>
// Decode both candidate encodings of the file name with unescape() for comparison.
alert(unescape('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
alert(unescape('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
</script>
time for bed, good night :-)
No, I think mine is right: A K C h b N have not been converted, as required, and the remaining 6 â's have been converted to %E2. Count them, and run this, which converts back:
<script>
// Decode both candidate encodings of the file name with unescape() for comparison.
alert(unescape('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
alert(unescape('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
</script>
time for bed, good night :-)
The difference here is that escape() uses a different encoding than UTF-8. It encodes all ISO-8859-1 characters (the lower 256 Unicode chars) with a %xx sequence, while it uses %uxxxx for all character codes >= 256. This behaviour is outdated, and that's why there is now encodeURIComponent.
Try this and you will see that the 2nd one is right:
<script>
// Pass both candidate encodings of the file name through encodeURIComponent().
alert(encodeURIComponent('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
alert(encodeURIComponent('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
</script>
Try this and you will see that the 2nd one is right:
<script>
// Pass both candidate encodings of the file name through encodeURIComponent().
alert(encodeURIComponent('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf'))
alert(encodeURIComponent('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
</script>
OK, here is a little downsized version that creates the array with a loop and uses GwynforWeb's regexp idea.
<html>
<head>
<!-- SAVE AS UTF-8!! (copy paste to notepad, save as, choose encoding:utf-8)
or remove the meta header field!! -->
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Encoder Test</title></head>
<body>
<pre>
<script>
// hex[b] is the lowercase "%xx" escape for the byte value b (0..255).
var digits ="0123456789abcdef";
var hex = new Array( 256 );
for( var idx = 0; idx < 256; idx++ )
    hex[ idx ] = "%" + digits.charAt( idx >> 4 ) + digits.charAt( idx & 0xf );

// Percent-encodes s as UTF-8. Letters, digits and - _ . ! ~ * ' ( ) are kept;
// every other character is converted to its UTF-8 bytes (1 to 4), each written
// as "%xx". A surrogate pair is merged into one code point and emitted as a
// single 4-byte sequence; an unpaired surrogate is still encoded alone as 3 bytes.
// Unlike the Java original, a space is written as %20, not '+'.
function encode( s )
{
    var unreserved = /[A-Za-z0-9\-_.!~*'()]/;
    var sbuf = "";
    var len = s.length;
    for( var i = 0; i < len; i++ ) {
        var ch = s.charAt(i);
        if( unreserved.test( ch ) ) {
            sbuf += ch;
            continue;
        }
        var cc = s.charCodeAt(i);
        // JS strings are UTF-16: combine a high/low surrogate pair first.
        if( cc >= 0xD800 && cc <= 0xDBFF && i + 1 < len ) {
            var low = s.charCodeAt( i + 1 );
            if( low >= 0xDC00 && low <= 0xDFFF ) {
                cc = 0x10000 + ( ( cc - 0xD800 ) << 10 ) + ( low - 0xDC00 );
                i++; // the low surrogate has been consumed
            }
        }
        if (cc <= 0x007f) { // other ASCII
            sbuf += hex[cc];
        } else if (cc <= 0x07FF) { // non-ASCII <= 0x7FF: 2 bytes
            sbuf += hex[0xc0 | (cc >> 6)]
                  + hex[0x80 | (cc & 0x3F)];
        } else if (cc <= 0xFFFF) { // 0x7FF < cc <= 0xFFFF: 3 bytes
            sbuf += hex[0xe0 | (cc >> 12)]
                  + hex[0x80 | ((cc >> 6) & 0x3F)]
                  + hex[0x80 | (cc & 0x3F)];
        } else { // 0xFFFF < cc <= 0x10FFFF: 4 bytes
            sbuf += hex[0xf0 | (cc >> 18)]
                  + hex[0x80 | ((cc >> 12) & 0x3F)]
                  + hex[0x80 | ((cc >> 6) & 0x3F)]
                  + hex[0x80 | (cc & 0x3F)];
        }
    }
    return sbuf;
}
// Demo output: compare encode() with encodeURIComponent(), then decode the two
// candidate file-name encodings with unescape() and decodeURIComponent().
document.writeln( "encode: " + encode("Test ä123ß xyz-_+?") );
document.writeln( "encuri: " + encodeURIComponent("Test ä123ß xyz-_+?") );
document.writeln();
document.writeln( "unescape");
document.writeln( unescape('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf') )
document.writeln( unescape('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
document.writeln( "decodeURIComponent");
// error: document.writeln( decodeURIComponent('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf') )
document.writeln( decodeURIComponent('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
document.writeln();
document.writeln( "encode: " + encode("AAAAAâKâCâhâuâbâN1.pdf") );
document.writeln( "encuri: " + encodeURIComponent("AAAAAâKâCâhâuâbâN1.pdf") );
document.writeln();
document.writeln( "some text in russian:" );
// The Russian sample is the same text that the percent-encoded string below decodes to.
document.writeln( "encode: " + encode("Экстрасенсы действительно") );
document.writeln( "decode: " + decodeURIComponent( "%d0%ad%d0%ba%d1%81%d1%82%d1%80%d0%b0%d1%81%d0%b5%d0%bd%d1%81" +
"%d1%8b%20%d0%b4%d0%b5%d0%b9%d1%81%d1%82%d0%b2%d0%b8%d1%82%d0%b5%d0%bb%d1%8c%d0%bd%d0%be" ) );
</script>
</pre>
</body>
</html>
<html>
<head>
<!-- SAVE AS UTF-8!! (copy paste to notepad, save as, choose encoding:utf-8)
or remove the meta header field!! -->
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Encoder Test</title></head>
<body>
<pre>
<script>
// hex[b] is the lowercase "%xx" escape for the byte value b (0..255).
var digits ="0123456789abcdef";
var hex = new Array( 256 );
for( var idx = 0; idx < 256; idx++ )
    hex[ idx ] = "%" + digits.charAt( idx >> 4 ) + digits.charAt( idx & 0xf );

// Percent-encodes s as UTF-8. Letters, digits and - _ . ! ~ * ' ( ) are kept;
// every other character is converted to its UTF-8 bytes (1 to 4), each written
// as "%xx". A surrogate pair is merged into one code point and emitted as a
// single 4-byte sequence; an unpaired surrogate is still encoded alone as 3 bytes.
// Unlike the Java original, a space is written as %20, not '+'.
function encode( s )
{
    var unreserved = /[A-Za-z0-9\-_.!~*'()]/;
    var sbuf = "";
    var len = s.length;
    for( var i = 0; i < len; i++ ) {
        var ch = s.charAt(i);
        if( unreserved.test( ch ) ) {
            sbuf += ch;
            continue;
        }
        var cc = s.charCodeAt(i);
        // JS strings are UTF-16: combine a high/low surrogate pair first.
        if( cc >= 0xD800 && cc <= 0xDBFF && i + 1 < len ) {
            var low = s.charCodeAt( i + 1 );
            if( low >= 0xDC00 && low <= 0xDFFF ) {
                cc = 0x10000 + ( ( cc - 0xD800 ) << 10 ) + ( low - 0xDC00 );
                i++; // the low surrogate has been consumed
            }
        }
        if (cc <= 0x007f) { // other ASCII
            sbuf += hex[cc];
        } else if (cc <= 0x07FF) { // non-ASCII <= 0x7FF: 2 bytes
            sbuf += hex[0xc0 | (cc >> 6)]
                  + hex[0x80 | (cc & 0x3F)];
        } else if (cc <= 0xFFFF) { // 0x7FF < cc <= 0xFFFF: 3 bytes
            sbuf += hex[0xe0 | (cc >> 12)]
                  + hex[0x80 | ((cc >> 6) & 0x3F)]
                  + hex[0x80 | (cc & 0x3F)];
        } else { // 0xFFFF < cc <= 0x10FFFF: 4 bytes
            sbuf += hex[0xf0 | (cc >> 18)]
                  + hex[0x80 | ((cc >> 12) & 0x3F)]
                  + hex[0x80 | ((cc >> 6) & 0x3F)]
                  + hex[0x80 | (cc & 0x3F)];
        }
    }
    return sbuf;
}
// Demo output: compare encode() with encodeURIComponent(), then decode the two
// candidate file-name encodings with unescape() and decodeURIComponent().
document.writeln( "encode: " + encode("Test ä123ß xyz-_+?") );
document.writeln( "encuri: " + encodeURIComponent("Test ä123ß xyz-_+?") );
document.writeln();
document.writeln( "unescape");
document.writeln( unescape('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf') )
document.writeln( unescape('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
document.writeln( "decodeURIComponent");
// error: document.writeln( decodeURIComponent('AAAAA%E2K%E2C%E2h%E2u%E2b%E2N1.pdf') )
document.writeln( decodeURIComponent('AAAAA%c3%a2K%c3%a2C%c3%a2h%c3%a2u%c3%a2b%c3%a2N1.pdf'))
document.writeln();
document.writeln( "encode: " + encode("AAAAAâKâCâhâuâbâN1.pdf") );
document.writeln( "encuri: " + encodeURIComponent("AAAAAâKâCâhâuâbâN1.pdf") );
document.writeln();
document.writeln( "some text in russian:" );
// The Russian sample is the same text that the percent-encoded string below decodes to.
document.writeln( "encode: " + encode("Экстрасенсы действительно") );
document.writeln( "decode: " + decodeURIComponent( "%d0%ad%d0%ba%d1%81%d1%82%d1%80%d0%b0%d1%81%d0%b5%d0%bd%d1%81" +
"%d1%8b%20%d0%b4%d0%b5%d0%b9%d1%81%d1%82%d0%b2%d0%b8%d1%82%d0%b5%d0%bb%d1%8c%d0%bd%d0%be" ) );
</script>
</pre>
</body>
</html>
Well actually you don't need to save it as UTF-8 anymore as EE's server has converted the russian text into HTML character entities. But it still works.
ASKER
GwynforWeb, thanks for contributing to the more compact version of the final solution. and thanks x4u for the clarifications and revisions.
// Show what escape() returns for a space, two letters and a newline.
alert(escape(' '))
alert(escape('A'))
alert(escape('a'))
alert(escape('\n'))
</script>