about | summary | refs | log | tree | commit | diff | stats
path: root/src/net/java/sip/communicator/util/Html2Text.java
diff options
context:
space:
mode:
author Yana Stamcheva <yana@jitsi.org> 2009-02-24 21:19:17 +0000
committer Yana Stamcheva <yana@jitsi.org> 2009-02-24 21:19:17 +0000
commit 35f1ff2288a9388b869484c7958661b04c6f1ae2 (patch)
tree bb726c0b6a72afd4042aede81fc8c2bf4d343f1f /src/net/java/sip/communicator/util/Html2Text.java
parent 34209282eb100dc384b0a77b23701d3505adae7f (diff)
download jitsi-35f1ff2288a9388b869484c7958661b04c6f1ae2.zip
jitsi-35f1ff2288a9388b869484c7958661b04c6f1ae2.tar.gz
jitsi-35f1ff2288a9388b869484c7958661b04c6f1ae2.tar.bz2
Optimize Html2Text to parse HTML directly through an HTMLEditorKit.ParserCallback, instead of creating an HTMLDocument.
Diffstat (limited to 'src/net/java/sip/communicator/util/Html2Text.java')
-rw-r--r-- src/net/java/sip/communicator/util/Html2Text.java 72
1 file changed, 57 insertions, 15 deletions
diff --git a/src/net/java/sip/communicator/util/Html2Text.java b/src/net/java/sip/communicator/util/Html2Text.java
index 8e778ea..4b8c34f 100644
--- a/src/net/java/sip/communicator/util/Html2Text.java
+++ b/src/net/java/sip/communicator/util/Html2Text.java
@@ -8,45 +8,87 @@
package net.java.sip.communicator.util;
import java.io.*;
+
import javax.swing.text.html.*;
-import javax.swing.text.*;
+import javax.swing.text.html.parser.*;
/**
* A utility class that allows to extract the text content of an html page
* stripped from all formatting tags.
*
* @author Emil Ivov <emcho at sip-communicator.org>
+ * @author Yana Stamcheva
*/
-public class Html2Text
+public class Html2Text
{
private static final Logger logger
= Logger.getLogger(Html2Text.class);
+
+ private static HTMLParserCallBack parser;
+
/**
- * The editor kit we use for conversions.
- */
- private HTMLEditorKit htmlEditorKit = new HTMLEditorKit();
-
- /**
- * A utility class that allows to extract the text content of an html page
+ * A utility method that allows to extract the text content of an html page
* stripped from all formatting tags. Method is synchronized to avoid
* concurrent access to the underlying html editor kit.
*
* @param html the html string that we will extract the text from.
* @return the text content of the <tt>html</tt> parameter.
*/
- public synchronized String extractText(String html)
+ public static synchronized String extractText(String html)
{
- Document doc = htmlEditorKit.createDefaultDocument();
-
+ if (parser == null)
+ parser = new HTMLParserCallBack();
+
try
{
- htmlEditorKit.read(new StringReader(html), doc, 0);
- return doc.getText(1, doc.getLength() - 1);
- }
+ StringReader in = new StringReader(html);
+ parser.parse(in);
+ in.close();
+
+ return parser.getText();
+ }
catch (Exception exc)
{
logger.info("Failed to extract plain text from html="+html, exc);
return html;
- }
+ }
+ }
+
+ /**
+ * The ParserCallback that will parse the html.
+ */
+ private static class HTMLParserCallBack extends HTMLEditorKit.ParserCallback
+ {
+ StringBuffer s;
+
+ /**
+ * Parses the text contained in the given reader.
+ *
+ * @param in the reader to parse.
+ * @throws IOException thrown if we fail to parse the reader.
+ */
+ public void parse(Reader in) throws IOException
+ {
+ s = new StringBuffer();
+ ParserDelegator delegator = new ParserDelegator();
+ // the third parameter is TRUE to ignore charset directive
+ delegator.parse(in, this, Boolean.TRUE);
+ }
+
+ /**
+ * Appends the given text to the string buffer.
+ */
+ public void handleText(char[] text, int pos)
+ {
+ s.append(text);
+ }
+
+ /**
+ * Returns the parsed text.
+ */
+ public String getText()
+ {
+ return s.toString();
+ }
}
}