diff options
author | Yana Stamcheva <yana@jitsi.org> | 2009-02-24 21:19:17 +0000 |
---|---|---|
committer | Yana Stamcheva <yana@jitsi.org> | 2009-02-24 21:19:17 +0000 |
commit | 35f1ff2288a9388b869484c7958661b04c6f1ae2 (patch) | |
tree | bb726c0b6a72afd4042aede81fc8c2bf4d343f1f /src/net/java/sip/communicator/util/Html2Text.java | |
parent | 34209282eb100dc384b0a77b23701d3505adae7f (diff) | |
download | jitsi-35f1ff2288a9388b869484c7958661b04c6f1ae2.zip jitsi-35f1ff2288a9388b869484c7958661b04c6f1ae2.tar.gz jitsi-35f1ff2288a9388b869484c7958661b04c6f1ae2.tar.bz2 |
Optimize Html2Text to parse HTML directly through an HTMLEditorKit.ParserCallback, instead of creating an HTMLDocument.
Diffstat (limited to 'src/net/java/sip/communicator/util/Html2Text.java')
-rw-r--r-- | src/net/java/sip/communicator/util/Html2Text.java | 72 |
1 files changed, 57 insertions, 15 deletions
diff --git a/src/net/java/sip/communicator/util/Html2Text.java b/src/net/java/sip/communicator/util/Html2Text.java index 8e778ea..4b8c34f 100644 --- a/src/net/java/sip/communicator/util/Html2Text.java +++ b/src/net/java/sip/communicator/util/Html2Text.java @@ -8,45 +8,87 @@ package net.java.sip.communicator.util; import java.io.*; + import javax.swing.text.html.*; -import javax.swing.text.*; +import javax.swing.text.html.parser.*; /** * A utility class that allows to extract the text content of an html page * stripped from all formatting tags. * * @author Emil Ivov <emcho at sip-communicator.org> + * @author Yana Stamcheva */ -public class Html2Text +public class Html2Text { private static final Logger logger = Logger.getLogger(Html2Text.class); + + private static HTMLParserCallBack parser; + /** - * The editor kit we use for conversions. - */ - private HTMLEditorKit htmlEditorKit = new HTMLEditorKit(); - - /** - * A utility class that allows to extract the text content of an html page + * A utility method that allows to extract the text content of an html page * stripped from all formatting tags. Method is synchronized to avoid * concurrent access to the underlying html editor kit. * * @param html the html string that we will extract the text from. * @return the text content of the <tt>html</tt> parameter. */ - public synchronized String extractText(String html) + public static synchronized String extractText(String html) { - Document doc = htmlEditorKit.createDefaultDocument(); - + if (parser == null) + parser = new HTMLParserCallBack(); + try { - htmlEditorKit.read(new StringReader(html), doc, 0); - return doc.getText(1, doc.getLength() - 1); - } + StringReader in = new StringReader(html); + parser.parse(in); + in.close(); + + return parser.getText(); + } catch (Exception exc) { logger.info("Failed to extract plain text from html="+html, exc); return html; - } + } + } + + /** + * The ParserCallback that will parse the html. 
+ */ + private static class HTMLParserCallBack extends HTMLEditorKit.ParserCallback + { + StringBuffer s; + + /** + * Parses the text contained in the given reader. + * + * @param in the reader to parse. + * @throws IOException thrown if we fail to parse the reader. + */ + public void parse(Reader in) throws IOException + { + s = new StringBuffer(); + ParserDelegator delegator = new ParserDelegator(); + // the third parameter is TRUE to ignore charset directive + delegator.parse(in, this, Boolean.TRUE); + } + + /** + * Appends the given text to the string buffer. + */ + public void handleText(char[] text, int pos) + { + s.append(text); + } + + /** + * Returns the parsed text. + */ + public String getText() + { + return s.toString(); + } } } |