istiaquem
asked on
Java xml parse error: Special characters
I have had this issue lingering for a while. I have an XML document that contains special characters, and I am having a serious problem trying to parse it. Experts, please advise.
Here is the xml
<?xml version="1.0" encoding="UTF-8"?>
<user_data>
<time_taken>ÀÀÀÀ</time_taken> ///SPECIAL CHARACTERS.
</user_data>
Here is my servlet which parses:
protected ModelAndView handleRequestInternal(HttpServletRequest request, HttpServletResponse response) throws Exception
{
request.setCharacterEncoding("UTF-8");
int contentLength = request.getContentLength();
if ( contentLength == -1 ) {
// Content length must be known.
throw new ServletException( "Content-Length must be specified" );
}
String contentType = request.getContentType();
System.out.println("request.getContentType(): " +request.getContentType() );
System.out.println("request.getContentLength() : " +request.getContentLength() );
boolean contentTypeIsOkay = false;
// Content-Type must be specified.
if ( contentType != null ) {
// The type must be plain text.
if ( contentType.startsWith( "text/xml" ) ) {
// And it must be UTF-8 encoded (or unspecified, in which case
// we assume
// that it's either UTF-8 or ASCII).
if ( contentType.indexOf( "charset=" ) == -1 ) {
contentTypeIsOkay = true;
} else if ( contentType.indexOf( "charset=utf-8" ) != -1 ) {
contentTypeIsOkay = true;
}
}
}
if ( !contentTypeIsOkay ) {
throw new ServletException(
"Content-Type must be 'text/xml' with 'charset=utf-8' (or unspecified charset)" );
}
InputStream in = request.getInputStream();
// InputStreamReader in = new InputStreamReader(request.getInputStream(), "UTF-8");
String decoded = null;
String pay = null;
try {
byte[] payload = new byte[contentLength];
int offset = 0;
int len = contentLength;
int byteCount;
while ( offset < contentLength ) {
byteCount = in.read( payload, offset, len );
if ( byteCount == -1 ) {
throw new ServletException( "Client did not send " + contentLength + " bytes as expected" );
}
offset += byteCount;
len -= byteCount;
}
pay = new String( payload, "UTF-8" );
System.out.println("xml is : " +pay );
decoded = URLDecoder.decode(pay, "utf-8");
System.out.println("decoded : " +decoded );
} finally {
if ( in != null ) {
in.close();
}
}
sun.io.ByteToCharConverter fromUnicode;
String convertedStr = decoded;
try {
fromUnicode = sun.io.ByteToCharConverter.getConverter("UTF-8");
fromUnicode.setSubstitutionMode(true);
char[] convertedChars;
convertedChars = fromUnicode.convertAll(convertedStr.getBytes());
convertedStr = new String(convertedChars);
System.out.println("convertedStr : " +convertedStr );
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
InputStream inputStream = request.getInputStream();
System.out.println("request.getCharacterEncoding() : " + request.getCharacterEncoding() );
SAXBuilder builder = null;
// Create an instance of the tester and test
builder = new SAXBuilder();
Document doc= builder.build(new java.io.ByteArrayInputStream(convertedStr.getBytes()));
//////ERROR : Illegal XML character: .
Element user_data =doc.getRootElement();
Here is the xml
<?xml version="1.0" encoding="UTF-8"?>
<user_data>
<time_taken>ÀÀÀÀ</time_taken> ///SPECIAL CHARACTERS.
</user_data>
Here is my servlet which parses:
protected ModelAndView handleRequestInternal(HttpServletRequest request, HttpServletResponse response) throws Exception
{
request.setCharacterEncoding("UTF-8");
int contentLength = request.getContentLength();
if ( contentLength == -1 ) {
// Content length must be known.
throw new ServletException( "Content-Length must be specified" );
}
String contentType = request.getContentType();
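System.out.println("request.getContentType(): " +request.getContentType() );
System.out.println("request.getContentLength() : " +request.getContentLength() );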
boolean contentTypeIsOkay = false;
// Content-Type must be specified.
if ( contentType != null ) {
// The type must be plain text.
if ( contentType.startsWith( "text/xml" ) ) {
// And it must be UTF-8 encoded (or unspecified, in which case
// we assume
// that it's either UTF-8 or ASCII).
if ( contentType.indexOf( "charset=" ) == -1 ) {
contentTypeIsOkay = true;
} else if ( contentType.indexOf( "charset=utf-8" ) != -1 ) {
contentTypeIsOkay = true;
}
}
}
if ( !contentTypeIsOkay ) {
throw new ServletException(
"Content-Type must be 'text/xml' with 'charset=utf-8' (or unspecified charset)" );
}
InputStream in = request.getInputStream();
// InputStreamReader in = new InputStreamReader(request.getInputStream(), "UTF-8");
String decoded = null;
String pay = null;
try {
byte[] payload = new byte[contentLength];
int offset = 0;
int len = contentLength;
int byteCount;
while ( offset < contentLength ) {
byteCount = in.read( payload, offset, len );
if ( byteCount == -1 ) {
throw new ServletException( "Client did not send " + contentLength + " bytes as expected" );
}
offset += byteCount;
len -= byteCount;
}
pay = new String( payload, "UTF-8" );
System.out.println("xml is : " +pay );
decoded = URLDecoder.decode(pay, "utf-8");
System.out.println("decoded : " +decoded );
} finally {
if ( in != null ) {
in.close();
}
}
sun.io.ByteToCharConverter fromUnicode;
String convertedStr = decoded;
try {
fromUnicode = sun.io.ByteToCharConverter.getConverter("UTF-8");
fromUnicode.setSubstitutionMode(true);
char[] convertedChars;
convertedChars = fromUnicode.convertAll(convertedStr.getBytes());
convertedStr = new String(convertedChars);
System.out.println("convertedStr : " +convertedStr );
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
InputStream inputStream = request.getInputStream();
System.out.println("request.getCharacterEncoding() : " + request.getCharacterEncoding() );
SAXBuilder builder = null;
// Create an instance of the tester and test
builder = new SAXBuilder();
Document doc= builder.build(new java.io.ByteArrayInputStream(convertedStr.getBytes()));
//////ERROR : Illegal XML character: .
Element user_data =doc.getRootElement();
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
In case links aren't working -- the suggestion is to try character encoding ISO-8859-1.