/// <summary>
/// Evaluates a single row: fills a fresh value array via <c>processChunk</c>,
/// wraps it in a <c>MatrixRow</c>, writes that row to file and logs a summary
/// line (row index, timeout count, number of match strings, elapsed time).
/// </summary>
/// <param name="batchArrayIndex">Position inside <paramref name="batchIndices"/> of the row to evaluate.</param>
/// <param name="batchIndices">Row indices of the current batch; the row evaluated is <c>batchIndices[batchArrayIndex]</c>.</param>
/// <param name="differentSeed">Not used by this method; kept so the signature stays compatible with existing callers.</param>
/// <param name="rexStringsBase">Passed through to <c>Util.getRexGeneratedStrings</c>.</param>
/// <param name="regexMap">Regexes by key; passed through to <c>processChunk</c>.</param>
/// <param name="keyList">Keys, one per column; its count determines the row length.</param>
/// <param name="minSimilarity">Used to derive the error budget and passed to <c>MatrixRow.writeRowToFile</c>.</param>
/// <param name="rowFileBase">Passed to <c>MatrixRow.writeRowToFile</c> as the output location base.</param>
/// <param name="nMatchStrings">Requested number of match strings; also used to derive the error budget.</param>
static void evaluateOneRow(int batchArrayIndex, int[] batchIndices, int differentSeed, string rexStringsBase, Dictionary<int, Regex> regexMap, List<int> keyList, double minSimilarity, string rowFileBase, int nMatchStrings)
{
    Stopwatch stopwatch = Stopwatch.StartNew();

    int rowIndex = batchIndices[batchArrayIndex];
    int nKeys = keyList.Count;

    // Every cell starts out as initializedFlag; processChunk is handed the
    // array and is presumably what overwrites cells with real results.
    double[] rowArray = new double[nKeys];
    for (int rowArrayIndex = 0; rowArrayIndex < nKeys; rowArrayIndex++)
    {
        rowArray[rowArrayIndex] = initializedFlag;
    }

    HashSet<string> matchingStrings_outer = Util.getRexGeneratedStrings(rowIndex, nKeys, rexStringsBase, nMatchStrings);
    int maxErrors = (int)((1 - minSimilarity) * nMatchStrings) + 1;

    // In order to protect against exponential worst-case regexes we chunk the
    // row, and wait in powers of two for each chunk. It's a mess but we cannot
    // get inside of this library code: 'regex_inner.Match(matchingString)' to
    // do proper cancellation.
    // NOTE(review): the waiting itself is not visible here; presumably it
    // lives in processChunk - confirm.

    // Single-element array so the callee can report timeouts back to us.
    int[] nTimeouts = { 0 };
    const int chunkSize = 128;

    // Walk the row in steps of chunkSize; the final chunk is clamped to nKeys,
    // which covers the "remainder" case without a second code path.
    for (int chunkStart = 0; chunkStart < nKeys; chunkStart += chunkSize)
    {
        int chunkStop = Math.Min(chunkStart + chunkSize, nKeys);
        processChunk(rowIndex, rowArray, chunkStart, chunkStop, matchingStrings_outer, regexMap, maxErrors, keyList, nTimeouts);
    }

    MatrixRow mr = new MatrixRow(rowIndex, rowArray, nKeys);
    mr.writeRowToFile(rowFileBase, minSimilarity);

    stopwatch.Stop();
    TimeSpan ts = stopwatch.Elapsed;
    string elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", ts.Hours, ts.Minutes, ts.Seconds, ts.Milliseconds / 10);
    Console.WriteLine("completed i: " + rowIndex + "/" + nKeys + " nTimeouts: " + nTimeouts[0] + " nMatchStrings:" + matchingStrings_outer.Count + " time taken: " + elapsedTime);
}
/// <summary>
/// Re-checks one previously stored row: loads it as a <c>MatrixRow</c>, calls
/// <c>validateCell</c> for every cell whose value equals
/// <c>SimilarityMatrixBuilder.verifiedTimeoutFlag</c>, writes the row back to
/// file and logs a summary line.
/// </summary>
/// <param name="rowIndex">Index of the row to validate.</param>
/// <param name="nRows">Row count; passed to the <c>MatrixRow</c> constructor and to <c>Util.getRexGeneratedStrings</c>.</param>
/// <param name="keyList">Keys by column position; column <c>j</c> is looked up as <c>keyList[j]</c>.</param>
/// <param name="regexMap">Regexes by key.</param>
/// <param name="minSimilarity">Used to derive the error budget and passed to <c>MatrixRow.writeRowToFile</c>.</param>
/// <param name="allRowsBase">Base used both to load the row and to write it back.</param>
/// <param name="rexStringsBase">Passed through to <c>Util.getRexGeneratedStrings</c>.</param>
/// <param name="stressCounter">Caller-owned counter; element 0 is incremented once per regex timeout.</param>
/// <param name="nMatchingStrings">Requested number of match strings; also used to derive the error budget.</param>
private static void validateRow(int rowIndex, int nRows, List<int> keyList, Dictionary<int, Regex> regexMap, double minSimilarity, string allRowsBase, string rexStringsBase, int[] stressCounter, int nMatchingStrings)
{
    Stopwatch timer = Stopwatch.StartNew();

    MatrixRow row = new MatrixRow(allRowsBase, rowIndex, nRows);
    HashSet<string> generatedStrings = Util.getRexGeneratedStrings(rowIndex, nRows, rexStringsBase, nMatchingStrings);
    int errorBudget = (int)((1 - minSimilarity) * nMatchingStrings) + 1;
    double[] cells = row.getValues();
    int timeoutCount = 0;

    for (int column = 0; column < cells.Length; column++)
    {
        // keyList was built from the filtered corpus with keys added in
        // order, so the column position doubles as the index into keyList.
        int columnKey = keyList[column];
        Regex columnRegex = regexMap[columnKey];

        // Per the original author's note: when a row is read from file, every
        // cell that is not a valid similarity (or is below the minimum) comes
        // back as the verified-timeout flag, so normally only a small fraction
        // of each row needs to be revisited here.
        if (cells[column] != SimilarityMatrixBuilder.verifiedTimeoutFlag)
        {
            continue;
        }

        try
        {
            validateCell(column, cells, generatedStrings, columnRegex, errorBudget);
        }
        catch (RegexMatchTimeoutException)
        {
            // A timeout is counted locally and in the shared counter, then
            // the loop carries on with the next column.
            timeoutCount++;
            stressCounter[0]++;
        }
    }

    row.writeRowToFile(allRowsBase, minSimilarity);

    timer.Stop();
    TimeSpan elapsed = timer.Elapsed;
    string elapsedText = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", elapsed.Hours, elapsed.Minutes, elapsed.Seconds, elapsed.Milliseconds / 10);
    string summary = "verified i: " + rowIndex + "/" + nRows
        + " nTimeouts: " + timeoutCount
        + " nMatchStrings:" + generatedStrings.Count
        + " time taken: " + elapsedText;
    Console.WriteLine(summary);
}