arichexe
asked on
URL stream to xhtml
How would I modify the below to convert a URL stream into an xhtml string, rather than an html file into an xhtml file?
import org.w3c.tidy.Tidy;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.w3c.dom.Document;
public class HTML_to_XHTML{
public static void main(String[] args){
try{
FileInputStream FIS=new FileInputStream("C://test.html");
FileOutputStream FOS=new FileOutputStream("C://testXHTML.xml");
Tidy T=new Tidy();
Document D=T.parseDOM(FIS,FOS);
}
catch (java.io.FileNotFoundException e)
{System.out.println(e.getMessage());}
}
}
}
// Reads the page behind 'url' and returns Tidy's output as a String instead of writing a file.
// Assumes 'url' is a java.net.URL: its stream is obtained with openStream().
InputStream in = url.openStream();
String xhtml;
try {
    // parseDOM(InputStream, OutputStream) needs an OutputStream, so buffer the output in memory.
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    Tidy tidy = new Tidy();
    tidy.setXHTML(true); // ask Tidy to emit XHTML rather than plain tidied HTML
    tidy.parseDOM(in, buffer);
    // NOTE(review): decodes with the platform default charset — confirm it matches Tidy's output encoding.
    xhtml = buffer.toString();
} finally {
    in.close();
}
ASKER
I'm getting a "Tidy cannot be resolved to a type" error.
<%@ page import="java.io.*,java.net.*,java.text.*,java.util.*,javax.xml.parsers.*,javax.xml.xpath.*,org.w3c.dom.*,org.w3c.dom.*,org.xml.sax.*,org.w3c.tidy.*" %>
<%
// Requests MyUrl over HTTP, passes the response through Tidy, re-parses Tidy's output with a
// DocumentBuilder, and prints every node matched by the XPath //title/text().
// NOTE(review): MyUrl is not defined in this snippet — presumably declared elsewhere; confirm.
URL url = new URL(MyUrl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("POST");
conn.setRequestProperty("Content-Type","text/xml");
conn.setDoOutput(true);
// The request stream is flushed and closed without anything being written to it.
OutputStream os = conn.getOutputStream();
os.flush();
os.close();
InputStream is = conn.getInputStream();
// ox is handed to Tidy as its output target and then read back as a String.
StringWriter ox = new StringWriter();
Tidy T=new Tidy();
T.parseDOM(is,ox);
String xhtml = ox.toString();
// Second parse: the String produced above is fed to a non-validating XML parser.
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setValidating(false);
factory.setIgnoringElementContentWhitespace(true);
DocumentBuilder builder = factory.newDocumentBuilder();
Document document = builder.parse(new InputSource(new StringReader(xhtml)));
document.getDocumentElement().normalize();
XPath xpath = XPathFactory.newInstance().newXPath();
NodeList nodeList = (NodeList) xpath.evaluate("//title/text()",document,XPathConstants.NODESET);
// Print one "msg: ..." entry per matched node, or a fallback message when nothing matched.
if (nodeList.getLength() > 0) {
for (int i = 0; i < nodeList.getLength(); i++) {
out.print("msg: " + nodeList.item(i).toString());
}
}else{
out.print("msg: not found");
}
%>
Make sure the JTidy jar is in your web application's WEB-INF/lib directory.
You should also be able to simplify your code to the following:
// Simplified version: query the Document that Tidy returns directly instead of re-parsing its output.
InputStream is = conn.getInputStream();
// parseDOM(InputStream, OutputStream) needs an OutputStream, so buffer the tidied markup in memory.
java.io.ByteArrayOutputStream ox = new java.io.ByteArrayOutputStream();
Tidy T = new Tidy();
Document document = T.parseDOM(is, ox);
// NOTE(review): decodes with the platform default charset — confirm it matches Tidy's output encoding.
String xhtml = ox.toString();
document.getDocumentElement().normalize();
XPath xpath = XPathFactory.newInstance().newXPath();
NodeList nodeList = (NodeList) xpath.evaluate("//title/text()", document, XPathConstants.NODESET);
if (nodeList.getLength() > 0) {
    for (int i = 0; i < nodeList.getLength(); i++) {
        // getNodeValue() yields the text node's characters; toString() would only print the object identity.
        out.print("msg: " + nodeList.item(i).getNodeValue());
    }
} else {
    out.print("msg: not found");
}
ASKER
Now I'm getting "The method parseDOM(InputStream, OutputStream) in the type Tidy is not applicable for the arguments (InputStream, StringWriter)."
<%@ page import="java.io.*,java.net.*,java.text.*,java.util.*,javax.xml.parsers.*,javax.xml.xpath.*,org.w3c.dom.*,org.w3c.dom.*,org.w3c.tidy.*,org.xml.sax.*" %>
<%
// Requests MyUrl over HTTP, lets Tidy build a DOM from the response, and prints every node
// matched by the XPath //title/text().
// NOTE(review): MyUrl is not defined in this snippet — presumably declared elsewhere; confirm.
URL url = new URL(MyUrl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("POST");
conn.setRequestProperty("Content-Type","text/html");
conn.setDoOutput(true);
// The request stream is flushed and closed without anything being written to it.
OutputStream os = conn.getOutputStream();
os.flush();
os.close();
InputStream is = conn.getInputStream();
StringWriter ox = new StringWriter();
Tidy T=new Tidy();
// NOTE(review): this is the call the compiler rejects — parseDOM(InputStream, OutputStream)
// is not applicable to (InputStream, StringWriter), because ox is a Writer.
Document document = T.parseDOM(is,ox);
String xhtml = ox.toString();
document.getDocumentElement().normalize();
XPath xpath = XPathFactory.newInstance().newXPath();
NodeList nodeList = (NodeList) xpath.evaluate("//title/text()",document,XPathConstants.NODESET);
// Print one "msg: ..." entry per matched node, or a fallback message when nothing matched.
if (nodeList.getLength() > 0) {
for (int i = 0; i < nodeList.getLength(); i++) {
out.print("msg: " + nodeList.item(i).toString());
}
}else{
out.print("msg: not found");
}
%>
you don't actually need to create the string at all, try this:
// No serialized copy is needed: pass null as the output and work on the returned DOM only.
InputStream is = conn.getInputStream();
Tidy T = new Tidy();
Document document = T.parseDOM(is, null);
document.getDocumentElement().normalize();
XPath xpath = XPathFactory.newInstance().newXPath();
// Select the text node(s) of every <title> element in the tidied document.
NodeList nodeList = (NodeList) xpath.evaluate("//title/text()", document, XPathConstants.NODESET);
ASKER
Now it returns this weird string "msg: org.w3c.tidy.DOMTextImpl@9674b2d" and the last 7 chars change when I hit refresh. No error, though. Strange.
<%@ page import="java.io.*,java.net.*,java.text.*,java.util.*,javax.xml.parsers.*,javax.xml.xpath.*,org.w3c.dom.*,org.w3c.dom.*,org.w3c.tidy.*,org.xml.sax.*" %>
<%
// Fetches http://MyUrl.com, lets Tidy build a DOM from the response (no serialized output is
// requested: the second parseDOM argument is null), and prints every node matched by //title/text().
URL url = new URL("http://MyUrl.com");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
InputStream is = conn.getInputStream();
Tidy T=new Tidy();
Document document = T.parseDOM(is,null);
document.getDocumentElement().normalize();
XPath xpath = XPathFactory.newInstance().newXPath();
NodeList nodeList = (NodeList) xpath.evaluate("//title/text()",document,XPathConstants.NODESET);
if (nodeList.getLength() > 0) {
for (int i = 0; i < nodeList.getLength(); i++) {
// NOTE(review): toString() on the matched node is what produces output of the form
// "org.w3c.tidy.DOMTextImpl@..."; Node.getNodeValue() is the DOM accessor for a text node's content.
out.print("msg: " + nodeList.item(i).toString());
}
}else{
out.print("msg: not found");
}
%>
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
ASKER
Thanks!
http://www.exampledepot.com/egs/javax.xml.transform/WriteDom.html