From 42df35bc3f26e329ce4825c8773f18b62b97f8b7 Mon Sep 17 00:00:00 2001
From: Michael Keppler
Date: Tue, 29 Apr 2014 09:51:09 +0200
Subject: fix #3795: filter invalid character references during import

---
 main/src/cgeo/geocaching/files/GPXParser.java      |   6 +-
 .../files/InvalidXMLCharacterFilterReader.java     | 151 ++++++++++++++++++++++
 .../files/InvalidXMLCharacterFilterReaderTest.java |  28 ++++
 3 files changed, 184 insertions(+), 1 deletion(-)
 create mode 100644 main/src/cgeo/geocaching/files/InvalidXMLCharacterFilterReader.java
 create mode 100644 tests/src/cgeo/geocaching/files/InvalidXMLCharacterFilterReaderTest.java

diff --git a/main/src/cgeo/geocaching/files/GPXParser.java b/main/src/cgeo/geocaching/files/GPXParser.java
index 8d328e4..5553927 100644
--- a/main/src/cgeo/geocaching/files/GPXParser.java
+++ b/main/src/cgeo/geocaching/files/GPXParser.java
@@ -23,6 +23,7 @@ import cgeo.geocaching.utils.Log;
 import cgeo.geocaching.utils.MatcherWrapper;
 import cgeo.geocaching.utils.SynchronizedDateFormat;
 
+import org.apache.commons.lang3.CharEncoding;
 import org.apache.commons.lang3.StringUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
@@ -34,8 +35,10 @@ import android.sax.RootElement;
 import android.sax.StartElementListener;
 import android.util.Xml;
 
+import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -809,7 +812,8 @@ public abstract class GPXParser extends FileParser {
 
         try {
             progressStream = new ProgressInputStream(stream);
-            Xml.parse(progressStream, Xml.Encoding.UTF_8, root.getContentHandler());
+            BufferedReader reader = new BufferedReader(new InputStreamReader(progressStream, CharEncoding.UTF_8));
+            Xml.parse(new InvalidXMLCharacterFilterReader(reader), root.getContentHandler());
             return DataStore.loadCaches(result, EnumSet.of(LoadFlag.LOAD_DB_MINIMAL));
         } catch (final SAXException e) {
             throw new ParserException("Cannot parse .gpx file as GPX " + version + ": could not parse XML", e);
diff --git a/main/src/cgeo/geocaching/files/InvalidXMLCharacterFilterReader.java b/main/src/cgeo/geocaching/files/InvalidXMLCharacterFilterReader.java
new file mode 100644
index 0000000..a7a3e1b
--- /dev/null
+++ b/main/src/cgeo/geocaching/files/InvalidXMLCharacterFilterReader.java
@@ -0,0 +1,151 @@
+package cgeo.geocaching.files;
+
+import java.io.FilterReader;
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Filter reader which removes characters and numeric character references that are not allowed in XML 1.0,
+ * so that a strict XML parser does not abort on them.
+ * <p>
+ * Known limitation: a character reference that is split across two reads of the underlying reader is not
+ * recognized and is passed on unchanged.
+ */
+public class InvalidXMLCharacterFilterReader extends FilterReader {
+
+    /** Highest code point defined by Unicode. */
+    private static final int MAX_CODE_POINT = 0x10FFFF;
+
+    public InvalidXMLCharacterFilterReader(final Reader in) {
+        super(in);
+    }
+
+    /**
+     * Reads a single character through the filtering {@link #read(char[], int, int)}, because
+     * {@link FilterReader#read()} hands out the characters of the underlying reader unfiltered.
+     *
+     * @return the next valid character, or -1 if the end of the underlying reader was reached
+     */
+    @Override
+    public int read() throws IOException {
+        final char[] single = new char[1];
+        return read(single, 0, 1) == -1 ? -1 : single[0];
+    }
+
+    /**
+     * Reads from the underlying reader and removes invalid characters and invalid numeric character
+     * references by shifting the valid characters to the left. After the last valid character there may
+     * therefore be some unused characters in the buffer.
+     *
+     * @return number of valid characters read, or -1 if the end of the underlying reader was reached;
+     *         0 is only returned if the underlying reader itself returned 0
+     */
+    @Override
+    public int read(final char[] cbuf, final int off, final int len) throws IOException {
+        while (true) {
+            final int read = super.read(cbuf, off, len);
+            if (read <= 0) {
+                // end of stream, or nothing could be read
+                return read;
+            }
+            final int valid = removeInvalidCharacters(cbuf, off, read);
+            if (valid > 0) {
+                return valid;
+            }
+            // everything was filtered out: read again instead of reporting an empty read
+        }
+    }
+
+    /**
+     * Compacts the valid characters of {@code cbuf[off .. off + count)} to the start of that range.
+     *
+     * @return number of characters remaining after the removal
+     */
+    private static int removeInvalidCharacters(final char[] cbuf, final int off, final int count) {
+        final int end = off + count;
+        int writePos = off;
+        // position of the last '&' in the already compacted output, -1 if no reference is open
+        int referenceStart = -1;
+        for (int readPos = off; readPos < end; readPos++) {
+            final char current = cbuf[readPos];
+            if (!isValidXMLChar(current) && !isSurrogate(current)) {
+                continue;
+            }
+            cbuf[writePos++] = current;
+            if (current == '&') {
+                referenceStart = writePos - 1;
+            } else if (current == ';' && referenceStart >= 0) {
+                if (isInvalidCharacterReference(cbuf, referenceStart, writePos)) {
+                    // drop the complete reference again
+                    writePos = referenceStart;
+                }
+                referenceStart = -1;
+            }
+        }
+        return writePos - off;
+    }
+
+    /**
+     * Checks whether {@code cbuf[start .. end)}, which starts with '&' and ends with ';', is a well-formed
+     * numeric character reference to a character that is not allowed in XML. Everything else, including
+     * malformed references, is left for the XML parser to judge.
+     */
+    private static boolean isInvalidCharacterReference(final char[] cbuf, final int start, final int end) {
+        // the shortest numeric reference is "&#0;"
+        if (end - start < 4 || cbuf[start + 1] != '#') {
+            return false;
+        }
+        final boolean hex = cbuf[start + 2] == 'x';
+        final int radix = hex ? 16 : 10;
+        final int digitsStart = start + (hex ? 3 : 2);
+        final int digitsEnd = end - 1;
+        if (digitsStart >= digitsEnd) {
+            return false;
+        }
+        int codePoint = 0;
+        for (int i = digitsStart; i < digitsEnd; i++) {
+            final int digit = digitValue(cbuf[i], hex);
+            if (digit < 0) {
+                return false;
+            }
+            // saturate just above the Unicode range so that long digit sequences cannot overflow
+            codePoint = Math.min(codePoint * radix + digit, MAX_CODE_POINT + 1);
+        }
+        return !isValidXMLChar(codePoint);
+    }
+
+    /**
+     * @return value of an ASCII decimal or (if {@code hex}) hexadecimal digit, -1 for any other character
+     */
+    private static int digitValue(final char c, final boolean hex) {
+        if (c >= '0' && c <= '9') {
+            return c - '0';
+        }
+        if (hex && c >= 'a' && c <= 'f') {
+            return c - 'a' + 10;
+        }
+        if (hex && c >= 'A' && c <= 'F') {
+            return c - 'A' + 10;
+        }
+        return -1;
+    }
+
+    /**
+     * Surrogates encode the characters beyond U+FFFF, which are valid in XML, and therefore must be kept.
+     */
+    private static boolean isSurrogate(final char c) {
+        return Character.isHighSurrogate(c) || Character.isLowSurrogate(c);
+    }
+
+    /**
+     * @return whether the code point matches the {@code Char} production of XML 1.0
+     */
+    private static boolean isValidXMLChar(final int c) {
+        return c == 0x9
+                || c == 0xA
+                || c == 0xD
+                || (c >= 0x20 && c <= 0xD7FF)
+                || (c >= 0xE000 && c <= 0xFFFD)
+                || (c >= 0x10000 && c <= MAX_CODE_POINT);
+    }
+}
\ No newline at end of file
diff --git a/tests/src/cgeo/geocaching/files/InvalidXMLCharacterFilterReaderTest.java b/tests/src/cgeo/geocaching/files/InvalidXMLCharacterFilterReaderTest.java
new file mode 100644
index 0000000..0641b5d
--- /dev/null
+++ b/tests/src/cgeo/geocaching/files/InvalidXMLCharacterFilterReaderTest.java
@@ -0,0 +1,28 @@
+package cgeo.geocaching.files;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import android.sax.EndTextElementListener;
+import android.sax.RootElement;
+import android.test.AndroidTestCase;
+import android.util.Xml;
+
+import java.io.StringReader;
+import java.util.concurrent.atomic.AtomicReference;
+
+public class InvalidXMLCharacterFilterReaderTest extends AndroidTestCase {
+
+    public static void testFilterInvalid() throws Exception {
+        final RootElement root = new RootElement("desc");
+        final AtomicReference<String> description = new AtomicReference<String>();
+        root.setEndTextElementListener(new EndTextElementListener() {
+
+            public void end(String body) {
+                description.set(body);
+            }
+        });
+        StringReader reader = new StringReader("<desc>Invalid&#xB;description</desc>");
+        Xml.parse(new InvalidXMLCharacterFilterReader(reader), root.getContentHandler());
+        assertThat(description.get()).isEqualTo("Invaliddescription");
+    }
+}
-- 
cgit v1.1