blob: eea14c3e5d90b3725268fef68aaa23d534fa5b05 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
package cgeo.geocaching.files;
import org.apache.commons.lang3.StringUtils;
import java.io.FilterReader;
import java.io.IOException;
import java.io.Reader;
/**
* Filter reader which can filter out invalid XML characters and character references.
*
*/
public class InvalidXMLCharacterFilterReader extends FilterReader
{
    public InvalidXMLCharacterFilterReader(final Reader in) {
        super(in);
    }

    /**
     * Reads a single character through the filter.
     * <br />
     * {@link FilterReader#read()} forwards directly to the wrapped reader, so
     * without this override single character reads would not be filtered.
     * Because only one character is looked at per call, this method can drop
     * invalid characters but cannot recognize character references.
     *
     * @return The next valid character, or <code>-1</code> if the end of the
     *         underlying reader was reached.
     */
    @Override
    public int read() throws IOException {
        final char[] single = new char[1];
        return read(single, 0, 1) == -1 ? -1 : single[0];
    }

    /**
     * Reads a chunk from the underlying reader and removes invalid XML
     * characters and numeric character references to invalid characters
     * (e.g. <code>&amp;#1;</code> or <code>&amp;#x1F;</code>) from it.
     * <br />
     * Valid characters are shifted to the left inside <code>cbuf</code>, so
     * the returned count may be smaller than the number of characters
     * delivered by the underlying reader, and the buffer may contain unused
     * characters after the last valid one.
     * <br />
     * Limitations: a character reference is only recognized if it lies
     * completely inside the chunk returned by one call of the underlying
     * reader. Malformed references (e.g. <code>&amp;#zz;</code>) are left
     * untouched. Surrogate code units are treated as invalid characters.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return Number of valid characters stored, or <code>-1</code> if the end
     *         of the underlying reader was reached.
     */
    @Override
    public int read(final char[] cbuf, final int off, final int len) throws IOException {
        while (true) {
            final int read = super.read(cbuf, off, len);
            // -1 signals the end of the stream, 0 means nothing was delivered: nothing to filter
            if (read <= 0) {
                return read;
            }
            final int kept = filter(cbuf, off, read);
            if (kept > 0) {
                return kept;
            }
            // Every character of this chunk was dropped. Returning 0 for a
            // non-empty request could be mistaken for the end of the stream,
            // therefore fetch the next chunk instead.
        }
    }

    /**
     * Compacts <code>count</code> characters starting at <code>off</code> in
     * place, keeping only valid content.
     *
     * @return Number of characters kept.
     */
    private static int filter(final char[] cbuf, final int off, final int count) {
        final int end = off + count;
        // next free slot of the compacted output, never ahead of the read position
        int writePos = off;
        // output index of the latest '&' not yet closed by ';', or -1
        int referenceStart = -1;
        for (int readPos = off; readPos < end; readPos++) {
            final char c = cbuf[readPos];
            if (!isValidXMLChar(c)) {
                continue;
            }
            cbuf[writePos] = c;
            if (c == '&') {
                referenceStart = writePos;
            } else if (c == ';' && referenceStart >= 0) {
                // The reference is inspected in the compacted output, where it
                // is contiguous even if characters were dropped before or
                // inside of it.
                final int start = referenceStart;
                // a later ';' must not be matched with this '&' again
                referenceStart = -1;
                if (isInvalidCharacterReference(cbuf, start, writePos)) {
                    // rewind the output to where the reference began
                    writePos = start;
                    continue;
                }
            }
            writePos++;
        }
        return writePos - off;
    }

    /**
     * Checks the text between <code>buf[start]</code> (a <code>'&amp;'</code>)
     * and <code>buf[end]</code> (a <code>';'</code>), both inclusive.
     *
     * @return <code>true</code> only if the text is a well-formed decimal
     *         (<code>&amp;#N;</code>) or hexadecimal (<code>&amp;#xN;</code>)
     *         character reference whose value is not a valid XML character.
     */
    private static boolean isInvalidCharacterReference(final char[] buf, final int start, final int end) {
        int digitPos = start + 2;
        // named entity, or no room for at least one digit
        if (digitPos >= end || buf[start + 1] != '#') {
            return false;
        }
        int radix = 10;
        if (buf[digitPos] == 'x') {
            radix = 16;
            digitPos++;
            if (digitPos == end) {
                return false;
            }
        }
        int value = 0;
        for (int i = digitPos; i < end; i++) {
            final char c = buf[i];
            // only ASCII digits are allowed in a character reference
            final int digit = c < 0x80 ? Character.digit(c, radix) : -1;
            if (digit < 0) {
                return false;
            }
            // saturate just above the Unicode range so long digit runs cannot overflow
            value = Math.min(value * radix + digit, Character.MAX_CODE_POINT + 1);
        }
        return !isValidXMLCodePoint(value);
    }

    private static boolean isValidXMLChar(final char c) {
        return isValidXMLCodePoint(c);
    }

    /**
     * Range check for the ranges #x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD
     * and #x10000-#x10FFFF.
     */
    private static boolean isValidXMLCodePoint(final int codePoint) {
        return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= Character.MAX_CODE_POINT);
    }
}
|