diff options
| -rw-r--r-- | main/src/cgeo/geocaching/Constants.java | 53 | ||||
| -rw-r--r-- | main/src/cgeo/geocaching/cgBase.java | 69 | ||||
| -rw-r--r-- | main/src/cgeo/geocaching/utils/BaseUtils.java | 48 | ||||
| -rw-r--r-- | tests/src/cgeo/geocaching/test/RegExPerformanceTest.java | 104 | ||||
| -rw-r--r-- | tests/src/cgeo/geocaching/test/mock/MockedCache.java | 5 |
5 files changed, 224 insertions, 55 deletions
/**
 * Regular-expression building blocks and precompiled patterns used to parse
 * cache detail pages.
 * <p>
 * The string constants are regex fragments meant to be concatenated into larger
 * expressions. For further information about patterns have a look at
 * {@link java.util.regex.Pattern}.
 * <p>
 * Both precompiled patterns are compiled without flags, i.e. they match
 * case-sensitively and {@code .} does not match line terminators.
 */
public final class Constants {

    /** Skip everything up to the start of the next tag. The tag may follow immediately. */
    public static final String NEXT_START_TAG = "[^<]*";
    /** Skip everything up to the end of the current tag. The closing bracket may follow immediately. */
    public static final String NEXT_END_TAG = "[^>]*";

    /** Skip everything up to the start of the next tag. At least one character must precede the tag. */
    public static final String NEXT_START_TAG2 = "[^<]+";
    /** Skip everything up to the end of the current tag. At least one character must precede the closing bracket. */
    public static final String NEXT_END_TAG2 = "[^>]+";

    /** Opening {@code <p>} tag. */
    public static final String TAG_P_START = "<p>";
    /** Closing {@code </p>} tag. */
    public static final String TAG_P_END = "</p>";
    /** Skip to the next tag, which must be {@code <p>}. */
    public static final String TAG_P_START_NEXT = NEXT_START_TAG + TAG_P_START;
    /** Skip to the next tag, which must be {@code </p>}. */
    public static final String TAG_P_END_NEXT = NEXT_START_TAG + TAG_P_END;

    /** Opening {@code <strong>} tag. */
    public static final String TAG_STRONG_START = "<strong>";
    /** Closing {@code </strong>} tag. */
    public static final String TAG_STRONG_END = "</strong>";
    /** Skip to the next tag, which must be {@code <strong>}. */
    public static final String TAG_STRONG_START_NEXT = NEXT_START_TAG + TAG_STRONG_START;
    /** Skip to the next tag, which must be {@code </strong>}. */
    public static final String TAG_STRONG_END_NEXT = NEXT_START_TAG + TAG_STRONG_END;

    /** Opening {@code <div>} tag. */
    public static final String TAG_DIV_START = "<div>";
    /** Closing {@code </div>} tag. */
    public static final String TAG_DIV_END = "</div>";
    /** Skip to the next tag, which must be {@code <div>}. */
    public static final String TAG_DIV_START_NEXT = NEXT_START_TAG + TAG_DIV_START;
    /** Skip to the next tag, which must be {@code </div>}. */
    public static final String TAG_DIV_END_NEXT = NEXT_START_TAG + TAG_DIV_END;

    /**
     * Matches the hint section of a cache page; group 1 is the raw content of the
     * {@code div_hint} element. The match is anchored on the preceding
     * "Additional Hints" heading and the following {@code <div id='dk'} element.
     */
    public static final Pattern PATTERN_HINT = Pattern.compile(
            "Additional Hints" + TAG_STRONG_END
                    + "[^\\(]*\\(<a" + NEXT_END_TAG2 + ">Encrypt</a>\\)" + TAG_P_END
                    + NEXT_START_TAG + "<div id=\"div_hint\"" + NEXT_END_TAG + ">(.*?)" + TAG_DIV_END
                    + NEXT_START_TAG + "<div id='dk'");

    /**
     * Matches the long description of a cache page; group 1 is the raw content of the
     * {@code ctl00_ContentBody_LongDescription} span. The match is anchored on the
     * "Additional Hints" heading that follows the description.
     */
    public static final Pattern PATTERN_DESC = Pattern.compile(
            "<span id=\"ctl00_ContentBody_LongDescription\">(.*?)</span>"
                    + TAG_DIV_END_NEXT + TAG_P_START_NEXT + TAG_P_END_NEXT + TAG_P_START_NEXT
                    + TAG_STRONG_START_NEXT + "\\W*Additional Hints" + TAG_STRONG_END);

    /** Constants holder; not meant to be instantiated. */
    private Constants() {
    }
}
id=\"div_hint\"[^>]*>(.*?)</div>", Pattern.CASE_INSENSITIVE); private final static Pattern patternPersonalNote = Pattern.compile("<p id=\"cache_note\"[^>]*>([^<]*)</p>", Pattern.CASE_INSENSITIVE); private final static Pattern patternDescShort = Pattern.compile("<div class=\"UserSuppliedContent\">[^<]*<span id=\"ctl00_ContentBody_ShortDescription\"[^>]*>((?:(?!</span>[^\\w^<]*</div>).)*)</span>[^\\w^<]*</div>", Pattern.CASE_INSENSITIVE); - private final static Pattern patternDesc = Pattern.compile("<span id=\"ctl00_ContentBody_LongDescription\"[^>]*>" + "(.*)</span>[^<]*</div>[^<]*<p>[^<]*</p>[^<]*<p>[^<]*<strong>\\W*Additional Hints</strong>", Pattern.CASE_INSENSITIVE); private final static Pattern patternCountLogs = Pattern.compile("<span id=\"ctl00_ContentBody_lblFindCounts\"><p(.+?)<\\/p><\\/span>", Pattern.CASE_INSENSITIVE); private final static Pattern patternCountLog = Pattern.compile("src=\"\\/images\\/icons\\/(.+?).gif\"[^>]+> (\\d*[,.]?\\d+)", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); private final static Pattern patternAttributes = Pattern.compile("<h3 class=\"WidgetHeader\">[^<]*<img[^>]+>\\W*Attributes[^<]*</h3>[^<]*<div class=\"WidgetBody\">(([^<]*<img src=\"[^\"]+\" alt=\"[^\"]+\"[^>]*>)+)[^<]*<p", Pattern.CASE_INSENSITIVE); @@ -1091,7 +1090,7 @@ public class cgBase { try { final Matcher matcherGeocode = patternGeocode.matcher(page); if (matcherGeocode.find() && matcherGeocode.groupCount() > 0) { - cache.geocode = getMatch(matcherGeocode.group(1)); + cache.geocode = BaseUtils.getMatch(matcherGeocode.group(1)); } } catch (Exception e) { // failed to parse cache geocode @@ -1102,7 +1101,7 @@ public class cgBase { try { final Matcher matcherCacheId = patternCacheId.matcher(page); if (matcherCacheId.find() && matcherCacheId.groupCount() > 0) { - cache.cacheId = getMatch(matcherCacheId.group(1)); + cache.cacheId = BaseUtils.getMatch(matcherCacheId.group(1)); } } catch (Exception e) { // failed to parse cache id @@ -1113,7 +1112,7 @@ public class 
cgBase { try { final Matcher matcherCacheGuid = patternCacheGuid.matcher(page); if (matcherCacheGuid.find() && matcherCacheGuid.groupCount() > 0) { - cache.guid = getMatch(matcherCacheGuid.group(1)); + cache.guid = BaseUtils.getMatch(matcherCacheGuid.group(1)); } } catch (Exception e) { // failed to parse cache guid @@ -1239,7 +1238,7 @@ public class cgBase { try { final Matcher matcherSize = patternSize.matcher(tableInside); if (matcherSize.find() && matcherSize.groupCount() > 0) { - cache.size = CacheSize.FIND_BY_ID.get(getMatch(matcherSize.group(1)).toLowerCase()); + cache.size = CacheSize.FIND_BY_ID.get(BaseUtils.getMatch(matcherSize.group(1)).toLowerCase()); } } catch (Exception e) { // failed to parse size @@ -1274,7 +1273,7 @@ public class cgBase { try { final Matcher matcherLatLon = patternLatLon.matcher(page); if (matcherLatLon.find() && matcherLatLon.groupCount() > 0) { - cache.latlon = getMatch(matcherLatLon.group(2)); // first is <b> + cache.latlon = BaseUtils.getMatch(matcherLatLon.group(2)); // first is <b> Map<String, Object> tmp = cgBase.parseLatlon(cache.latlon); if (tmp.size() > 0) { @@ -1294,7 +1293,7 @@ public class cgBase { try { final Matcher matcherLocation = patternLocation.matcher(page); if (matcherLocation.find() && matcherLocation.groupCount() > 0) { - cache.location = getMatch(matcherLocation.group(1)); + cache.location = BaseUtils.getMatch(matcherLocation.group(1)); } } catch (Exception e) { // failed to parse location @@ -1303,7 +1302,7 @@ public class cgBase { // cache hint try { - final Matcher matcherHint = patternHint.matcher(page); + final Matcher matcherHint = Constants.PATTERN_HINT.matcher(page); if (matcherHint.find() && matcherHint.group(1) != null) { // replace linebreak and paragraph tags String hint = Pattern.compile("<(br|p)[^>]*>").matcher(matcherHint.group(1)).replaceAll("\n"); @@ -1346,7 +1345,7 @@ public class cgBase { try { final Matcher matcherPersonalNote = patternPersonalNote.matcher(page); if 
(matcherPersonalNote.find() && matcherPersonalNote.groupCount() > 0) { - cache.personalNote = getMatch(matcherPersonalNote.group(1)); + cache.personalNote = BaseUtils.getMatch(matcherPersonalNote.group(1)); } } catch (Exception e) { // failed to parse cache personal note @@ -1357,7 +1356,7 @@ public class cgBase { try { final Matcher matcherDescShort = patternDescShort.matcher(page); if (matcherDescShort.find() && matcherDescShort.groupCount() > 0) { - cache.shortdesc = getMatch(matcherDescShort.group(1)); + cache.shortdesc = BaseUtils.getMatch(matcherDescShort.group(1)); } } catch (Exception e) { // failed to parse short description @@ -1366,9 +1365,9 @@ public class cgBase { // cache description try { - final Matcher matcherDesc = patternDesc.matcher(page); + final Matcher matcherDesc = Constants.PATTERN_DESC.matcher(page); if (matcherDesc.find() && matcherDesc.groupCount() > 0) { - cache.description = getMatch(matcherDesc.group(1)); + cache.description = BaseUtils.getMatch(matcherDesc.group(1)); } } catch (Exception e) { // failed to parse short description @@ -1775,16 +1774,6 @@ public class cgBase { } } - private static String getMatch(String match) { - // creating a new String via String constructor is necessary here!! - return new String(match.trim()); - // Java copies the whole page String, when matching with regular expressions - // later this would block the garbage collector, as we only need tiny parts of the page - // see http://developer.android.com/reference/java/lang/String.html#backing_array - - // And BTW: You cannot even see that effect in the debugger, but must use a separate memory profiler! 
- } - public Date parseGcCustomDate(String input) throws ParseException { @@ -3745,7 +3734,7 @@ public class cgBase { "GET", new HashMap<String, String>(), requestId, false, false, false); } else { if (StringUtils.isNotEmpty(buffer)) { - replaceWhitespace(buffer); + BaseUtils.replaceWhitespace(buffer); String data = buffer.toString(); buffer = null; @@ -3766,32 +3755,6 @@ public class cgBase { return response; } - /** - * Replace the characters \n, \r and \t with a space - * - * @param buffer - * The data - */ - public static void replaceWhitespace(final StringBuffer buffer) { - final int length = buffer.length(); - final char[] chars = new char[length]; - buffer.getChars(0, length, chars, 0); - int resultSize = 0; - boolean lastWasWhitespace = false; - for (char c : chars) { - if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { - if (!lastWasWhitespace) { - chars[resultSize++] = ' '; - } - lastWasWhitespace = true; - } else { - chars[resultSize++] = c; - lastWasWhitespace = false; - } - } - buffer.setLength(0); - buffer.append(chars); - } public String requestJSONgc(final URI uri, String params) { int httpCode = -1; @@ -3879,7 +3842,7 @@ public class cgBase { final URI newLocation = uri.resolve(httpLocation); page = requestJSONgc(newLocation, params); } else { - replaceWhitespace(buffer); + BaseUtils.replaceWhitespace(buffer); page = buffer.toString(); } @@ -4029,7 +3992,7 @@ public class cgBase { * } * } else { */ - replaceWhitespace(buffer); + BaseUtils.replaceWhitespace(buffer); page = buffer.toString(); //} @@ -4604,7 +4567,7 @@ public class cgBase { /** * Generate a numeric date and time string according to system-wide settings (locale, * date format) such as "7 sept. à 12:35". 
/**
 * Miscellaneous static helpers for preparing and extracting text from downloaded pages.
 */
public final class BaseUtils {

    /** Static helpers only; not meant to be instantiated. */
    private BaseUtils() {
    }

    /**
     * Collapses every run of whitespace (space, {@code \n}, {@code \r}, {@code \t})
     * into a single space. The buffer is modified in place.
     *
     * @param buffer
     *            the data to normalize; its content is replaced by the collapsed text
     */
    public static void replaceWhitespace(final StringBuffer buffer) {
        final int length = buffer.length();
        final char[] chars = new char[length];
        buffer.getChars(0, length, chars, 0);
        // Compact in place: the write index never overtakes the read position.
        int resultSize = 0;
        boolean lastWasWhitespace = false;
        for (char c : chars) {
            if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                if (!lastWasWhitespace) {
                    chars[resultSize++] = ' ';
                }
                lastWasWhitespace = true;
            } else {
                chars[resultSize++] = c;
                lastWasWhitespace = false;
            }
        }
        buffer.setLength(0);
        // Only the first resultSize chars are valid; appending the whole array would
        // leave stale characters from the original text at the end of the buffer.
        buffer.append(chars, 0, resultSize);
    }

    /**
     * Returns a trimmed, independent copy of a regular-expression match.
     * <p>
     * Creating a new String via the String constructor is necessary here: a matched
     * group may share the backing array of the whole page String, which would keep
     * the page from being garbage collected although only a tiny part is needed.
     * See http://developer.android.com/reference/java/lang/String.html#backing_array
     * This effect is not visible in the debugger, only in a separate memory profiler.
     *
     * @param match
     *            the matched text, must not be {@code null}
     * @return a new String holding the trimmed match
     */
    public static String getMatch(String match) {
        return new String(match.trim());
    }

}
+ } + +} diff --git a/tests/src/cgeo/geocaching/test/RegExPerformanceTest.java b/tests/src/cgeo/geocaching/test/RegExPerformanceTest.java new file mode 100644 index 0000000..277ec32 --- /dev/null +++ b/tests/src/cgeo/geocaching/test/RegExPerformanceTest.java @@ -0,0 +1,104 @@ +package cgeo.geocaching.test; + +import cgeo.geocaching.Constants; +import cgeo.geocaching.test.mock.GC1ZXX2; +import cgeo.geocaching.test.mock.GC2CJPF; +import cgeo.geocaching.test.mock.MockedCache; +import cgeo.geocaching.utils.BaseUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import junit.framework.TestCase; + +/** + * Test class to compare the performance of two regular expressions on given data. + * Can be used to improve the time needed to parse the cache data + * Run As "JUnit Test" + * + * @author blafoo + */ +public class RegExPerformanceTest extends TestCase { + + // Regular expression: "<img.*src=(\S*)/>" + // Input string 1: "<img border=1 src=image.jpg />" + // Input string 2: "<img src=src=src=src= .... many src= ... src=src=" + // "a(.*)a", it's much better to use "a([^a]*)a". + // The rewritten expression "<img((?!src=).)*src=(\S*)/>" will handle a large, non-matching string almost a hundred times faster then the previous one! 
+ + private final static Pattern PATTERN_ACTUAL = Pattern.compile("<div id=\"div_hint\"[^>]*>(.*?)</div>", Pattern.CASE_INSENSITIVE); + + private static final Pattern PATTERN_IMPROVED = Pattern.compile( + "Additional Hints" + Constants.TAG_STRONG_END + + "[^\\(]*\\(<a" + Constants.NEXT_END_TAG2 + ">Encrypt</a>\\)" + Constants.TAG_P_END + + Constants.NEXT_START_TAG + "<div id=\"div_hint\"" + Constants.NEXT_END_TAG + ">(.*?)" + Constants.TAG_DIV_END + Constants.NEXT_START_TAG + "<div id='dk'"); + + + private String parseHint(String data, Pattern p, int group) { + String result = ""; + final Matcher matcherHint = p.matcher(data); + if (matcherHint.find() && matcherHint.groupCount() >= group && matcherHint.group(group) != null) { + // replace linebreak and paragraph tags + String hint = Pattern.compile("<(br|p)" + Constants.NEXT_END_TAG + ">").matcher(matcherHint.group(group)).replaceAll("\n"); + if (hint != null) { + result = hint.replaceAll(Pattern.quote(Constants.TAG_P_END), "").trim(); + } + } + return result; + } + + private String parseDescription(String data, Pattern p, int group) { + String result = null; + final Matcher matcher = p.matcher(data); + if (matcher.find() && matcher.groupCount() >= group) { + result = BaseUtils.getMatch(matcher.group(group)); + } + return result; + } + + + public void testRegEx() { + + List<MockedCache> cachesForParsing = new ArrayList<MockedCache>(); + cachesForParsing.add(new GC2CJPF()); + cachesForParsing.add(new GC1ZXX2()); + + int ITERATIONS = 250; // 250 for an fast evaluation, 10000 else + + for (MockedCache cache : cachesForParsing) { + String page = cache.getData(); + String resultOld = parseHint(page, PATTERN_ACTUAL, 1); + String resultNew = parseHint(page, PATTERN_IMPROVED, 1); + assertEquals(resultOld, resultNew); + + long diffOld, diffNew; + + System.out.println("Parsing " + cache.getGeocode() + " " + cache.getName()); + { + System.out.println(("Result actual pattern:\t<<" + resultOld + ">>")); + + long start = 
System.currentTimeMillis(); + for (int j = 0; j < ITERATIONS; j++) { + parseHint(page, PATTERN_ACTUAL, 1); + } + diffOld = (System.currentTimeMillis() - start); + System.out.println("Time actual pattern:\t" + diffOld + " ms"); + } + + { + System.out.println(("Result new pattern:\t<<" + resultNew + ">>")); + long start = System.currentTimeMillis(); + for (int j = 0; j < ITERATIONS; j++) { + parseHint(page, PATTERN_IMPROVED, 1); + } + diffNew = (System.currentTimeMillis() - start); + System.out.println("Time new pattern:\t" + diffNew + " ms"); + } + Float reduction = new Float((float) diffNew * 100 / (float) diffOld); + System.out.println("Reduction to x percent:\t" + reduction.toString() + "\n"); + } + + } +} diff --git a/tests/src/cgeo/geocaching/test/mock/MockedCache.java b/tests/src/cgeo/geocaching/test/mock/MockedCache.java index 7494028..8c04a18 100644 --- a/tests/src/cgeo/geocaching/test/mock/MockedCache.java +++ b/tests/src/cgeo/geocaching/test/mock/MockedCache.java @@ -1,7 +1,7 @@ package cgeo.geocaching.test.mock; import cgeo.geocaching.ICache; -import cgeo.geocaching.cgBase; +import cgeo.geocaching.utils.BaseUtils; import java.io.BufferedReader; import java.io.IOException; @@ -27,9 +27,10 @@ public abstract class MockedCache implements ICache { buffer.append(line).append("\n"); } + br.close(); - cgBase.replaceWhitespace(buffer); + BaseUtils.replaceWhitespace(buffer); return buffer.toString(); } catch (IOException e) { |
