package org.html5index.docscan;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.sax.SAXSource;

import org.ccil.cowan.tagsoup.Parser;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;

/**
 * Loads text and DOM documents from classpath resources or URLs.
 *
 * <p>Content fetched from a URL is stored in a local {@code cache} directory (relative to the
 * working directory) and served from there on subsequent requests for the same URL.
 */
public class DomLoader {

  /** Encoding used for classpath resources and for files in the cache directory. */
  private static final String UTF_8 = "utf-8";

  /** Encoding assumed for a download whose Content-Type declares no charset. */
  private static final String DEFAULT_DOWNLOAD_CHARSET = "ISO-8859-1";

  /** Directory, relative to the working directory, that holds cached downloads. */
  private static final String CACHE_DIR = "cache";

  private static final String CHARSET_PREFIX = "charset=";

  /**
   * Opens a reader for the given location.
   *
   * <p>A location starting with {@code "/"} is read as a UTF-8 classpath resource. Anything else
   * is treated as a URL: if a cache file for it exists, that file is read; otherwise the URL is
   * downloaded completely, written to the cache as UTF-8, and a reader over the downloaded text
   * is returned.
   *
   * @param url classpath resource path (leading {@code "/"}) or URL
   * @return a reader the caller must close
   * @throws FileNotFoundException if a classpath resource does not exist
   * @throws IOException if downloading, decoding or caching fails
   */
  public static BufferedReader openReader(String url) throws IOException {
    if (url.startsWith("/")) {
      InputStream resource = DomLoader.class.getResourceAsStream(url);
      if (resource == null) {
        // getResourceAsStream signals "not found" with null; fail with a clear message
        // instead of a NullPointerException inside InputStreamReader.
        throw new FileNotFoundException("Classpath resource not found: " + url);
      }
      return new BufferedReader(new InputStreamReader(resource, UTF_8));
    }

    // NOTE(review): this mapping is not injective ("a/b" and "a_2b" share a cache file).
    // It is kept unchanged so that existing cache directories remain valid.
    String cacheName = url.replace(":", "_").replace("/", "_2");
    File cacheFile = new File(CACHE_DIR, cacheName);
    if (cacheFile.exists()) {
      return new BufferedReader(new InputStreamReader(new FileInputStream(cacheFile), UTF_8));
    }

    String text = download(url);
    writeCache(cacheFile, text);
    return new BufferedReader(new StringReader(text));
  }

  /** Downloads the URL and decodes it with the charset declared in its Content-Type. */
  private static String download(String url) throws IOException {
    URLConnection con = new URL(url).openConnection();
    String charSet = charsetOf(con.getContentType());
    InputStream in = con.getInputStream();
    try {
      return loadText(new BufferedReader(new InputStreamReader(in, charSet)));
    } finally {
      // Closing the underlying stream also covers the case where the reader could not be
      // constructed (unsupported charset name).
      in.close();
    }
  }

  /**
   * Extracts the charset parameter from a Content-Type value, matching the parameter name
   * case-insensitively and stripping surrounding double quotes. Returns the default download
   * charset when the value is null or carries no usable charset parameter.
   */
  private static String charsetOf(String contentType) {
    if (contentType != null) {
      for (String part : contentType.split(";")) {
        part = part.trim();
        if (part.regionMatches(true, 0, CHARSET_PREFIX, 0, CHARSET_PREFIX.length())) {
          String name = part.substring(CHARSET_PREFIX.length()).trim();
          if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
            name = name.substring(1, name.length() - 1).trim();
          }
          if (name.length() > 0) {
            return name;
          }
        }
      }
    }
    return DEFAULT_DOWNLOAD_CHARSET;
  }

  /** Writes the text to the cache file as UTF-8, removing the file again if writing fails. */
  private static void writeCache(File cacheFile, String text) throws IOException {
    // Result intentionally ignored: the directory usually exists already, and if it could not
    // be created the FileOutputStream constructor below reports the failure.
    new File(CACHE_DIR).mkdir();
    Writer writer = new OutputStreamWriter(new FileOutputStream(cacheFile), UTF_8);
    boolean complete = false;
    try {
      writer.write(text);
      writer.close(); // flushes; only after this is the file known to be complete
      complete = true;
    } finally {
      if (!complete) {
        try {
          writer.close();
        } catch (IOException ignored) {
          // The original failure is already propagating; a close error adds nothing.
        }
        // A truncated file would otherwise be served as a valid cache hit next time.
        cacheFile.delete();
      }
    }
  }

  /**
   * Loads the complete text behind the given location; see {@link #openReader(String)} for how
   * the location is resolved.
   *
   * @param url classpath resource path (leading {@code "/"}) or URL
   * @return the text, with lines joined by {@code '\n'}
   * @throws IOException if the content cannot be opened or read
   */
  static String loadText(String url) throws IOException {
    BufferedReader reader = openReader(url);
    try {
      return loadText(reader);
    } finally {
      reader.close();
    }
  }

  /**
   * Reads all remaining lines from the reader and joins them with {@code '\n'}.
   *
   * <p>Because input is consumed via {@link BufferedReader#readLine()}, every line terminator
   * becomes {@code '\n'} and the result has no trailing newline. The reader is not closed.
   *
   * @param reader source to drain
   * @return the joined text; empty if the reader has no more lines
   * @throws IOException if reading fails
   */
  static String loadText(BufferedReader reader) throws IOException {
    StringBuilder sb = new StringBuilder();
    String line = reader.readLine();
    while (line != null) {
      sb.append(line);
      line = reader.readLine();
      if (line != null) {
        sb.append('\n');
      }
    }
    return sb.toString();
  }

  /**
   * Parses the content behind the given location with the TagSoup parser and returns it as a
   * DOM document. The parser's namespace and namespace-prefix features are switched off.
   *
   * @param url classpath resource path (leading {@code "/"}) or URL
   * @return the parsed document
   * @throws RuntimeException wrapping any failure while opening, parsing or transforming
   */
  public static Document loadDom(String url) {
    Parser parser = new Parser();
    try {
      parser.setFeature(Parser.namespacesFeature, false);
      parser.setFeature(Parser.namespacePrefixesFeature, false);
      Reader reader = openReader(url);
      try {
        DOMResult result = new DOMResult();
        // An identity transformer is used to pipe the parser's SAX events into a DOM tree.
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.transform(new SAXSource(parser, new InputSource(reader)), result);
        return (Document) result.getNode();
      } finally {
        reader.close();
      }
    } catch (Exception e) {
      throw new RuntimeException("Failed to load DOM from " + url, e);
    }
  }
}