// main/src/cgeo/geocaching/utils/HtmlUtils.java

package cgeo.geocaching.utils;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;

import android.text.Spanned;
import android.text.style.ImageSpan;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

/**
 * Static helpers for turning HTML based text into plain text and for escaping characters as HTML numeric
 * character references.
 */
public class HtmlUtils {

    /** Characters with a code point above this value are written as numeric character references. */
    private static final int MAX_UNESCAPED_CODE_POINT = 300;

    /**
     * Extract the text from a HTML based string. This is similar to what HTML.fromHtml(...) does, but this method also
     * removes the embedded images instead of replacing them by a small rectangular representation character.
     *
     * @param html
     *            the text to convert, must not be {@code null}. If it is a {@link Spanned}, every range covered by
     *            an {@link ImageSpan} is dropped from the result.
     * @return the text without image ranges, with each literal {@code <br />} replaced by a line feed and with
     *         leading and trailing whitespace trimmed
     */
    public static String extractText(CharSequence html) {
        String result = html.toString();

        // recognize images in textview HTML contents
        if (html instanceof Spanned) {
            final Spanned text = (Spanned) html;
            final ImageSpan[] images = text.getSpans(0, text.length(), ImageSpan.class);
            if (images.length > 0) {
                final ArrayList<Pair<Integer, Integer>> removals = new ArrayList<Pair<Integer, Integer>>(images.length);
                for (ImageSpan image : images) {
                    removals.add(Pair.of(text.getSpanStart(image), text.getSpanEnd(image)));
                }

                // sort by start position, so that the kept text can be copied in a single forward pass
                Collections.sort(removals, new Comparator<Pair<Integer, Integer>>() {

                    @Override
                    public int compare(Pair<Integer, Integer> lhs, Pair<Integer, Integer> rhs) {
                        return lhs.getLeft().compareTo(rhs.getLeft());
                    }
                });

                final StringBuilder kept = new StringBuilder(result.length());
                // index of the first character that is not yet known to be part of an image range
                int cursor = 0;
                for (Pair<Integer, Integer> removal : removals) {
                    final int start = removal.getLeft();
                    final int end = removal.getRight();
                    if (start > cursor) {
                        kept.append(result, cursor, start);
                    }
                    // overlapping or duplicated ranges must never move the cursor backwards
                    cursor = Math.max(cursor, end);
                }
                kept.append(result, cursor, result.length());
                result = kept.toString();
            }
        }

        // some line breaks are still in the text, source is unknown
        return StringUtils.replace(result, "<br />", "\n").trim();
    }

    /**
     * Convert every character with a code point above 300 into its decimal HTML numeric character reference
     * (<code>&amp;#NNN;</code>). All other characters are copied unchanged.
     * <p>
     * The input is processed by Unicode code point, therefore a character outside the Basic Multilingual Plane
     * (stored as a surrogate pair) results in one reference to the character itself and not in two references to
     * its surrogate halves.
     * </p>
     *
     * @param input
     *            String to escape, must not be {@code null}
     * @return output String
     */
    public static String convertNonLatinCharactersToHTML(final String input) {
        final int inputLen = input.length();
        final StringBuilder output = new StringBuilder(inputLen);

        int i = 0;
        while (i < inputLen) {
            final int codePoint = input.codePointAt(i);

            if (codePoint > MAX_UNESCAPED_CODE_POINT) {
                output.append("&#");
                output.append(codePoint);
                output.append(';');
            } else {
                // code points up to the limit always fit into a single char
                output.append((char) codePoint);
            }

            // advance by 2 for a surrogate pair, by 1 otherwise
            i += Character.charCount(codePoint);
        }

        return output.toString();
    }
}