예제 #1
0
        /// <summary>
        /// Builds one row of values for the row index selected from
        /// <paramref name="batchIndices"/>, writes that row to file, and logs a
        /// one-line summary (row index, timeout count, string count, elapsed time).
        /// </summary>
        /// <param name="batchArrayIndex">Position in <paramref name="batchIndices"/> of the row to process.</param>
        /// <param name="batchIndices">Row indices belonging to the current batch.</param>
        /// <param name="differentSeed">Not used by this method; kept so existing callers keep compiling.</param>
        /// <param name="rexStringsBase">Passed through to <c>Util.getRexGeneratedStrings</c>.</param>
        /// <param name="regexMap">Passed through to <c>processChunk</c>.</param>
        /// <param name="keyList">Its <c>Count</c> determines the row length; also passed to <c>processChunk</c>.</param>
        /// <param name="minSimilarity">Used to derive the error budget and passed to <c>writeRowToFile</c>.</param>
        /// <param name="rowFileBase">Passed through to <c>MatrixRow.writeRowToFile</c>.</param>
        /// <param name="nMatchStrings">Number of strings requested from <c>Util.getRexGeneratedStrings</c>.</param>
        static void evaluateOneRow(int batchArrayIndex, int[] batchIndices, int differentSeed, string rexStringsBase, Dictionary <int, Regex> regexMap, List <int> keyList, double minSimilarity, string rowFileBase, int nMatchStrings)
        {
            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();

            int rowIndex = batchIndices[batchArrayIndex];
            int nKeys    = keyList.Count;

            // Every cell starts out as initializedFlag; processChunk receives the
            // array and is presumably what overwrites the cells it evaluates.
            double[] rowArray = new double[nKeys];
            for (int rowArrayIndex = 0; rowArrayIndex < nKeys; rowArrayIndex++)
            {
                rowArray[rowArrayIndex] = initializedFlag;
            }

            HashSet <string> matchingStrings_outer = Util.getRexGeneratedStrings(rowIndex, nKeys, rexStringsBase, nMatchStrings);

            // (1 - minSimilarity) * nMatchStrings truncated to an int, plus one.
            int maxErrors = (int)((1 - minSimilarity) * nMatchStrings) + 1;

            // In order to protect against exponential worst-case regexes we chunk the row,
            // and wait in powers of two for each chunk.  It's a mess but we cannot get inside of
            // this library code: 'regex_inner.Match(matchingString)' to do proper cancellation.
            // nTimeouts is a one-element array so processChunk can update the count in place.
            int[]     nTimeouts = { 0 };
            const int chunkSize = 128;

            // One loop covers both the full chunks and the shorter trailing chunk:
            // the final chunk's end is clamped to nKeys.
            for (int chunkStart = 0; chunkStart < nKeys; chunkStart += chunkSize)
            {
                int chunkStop = Math.Min(chunkStart + chunkSize, nKeys);
                processChunk(rowIndex, rowArray, chunkStart, chunkStop, matchingStrings_outer, regexMap, maxErrors, keyList, nTimeouts);
            }

            MatrixRow mr = new MatrixRow(rowIndex, rowArray, nKeys);

            mr.writeRowToFile(rowFileBase, minSimilarity);

            stopwatch.Stop();
            TimeSpan ts          = stopwatch.Elapsed;
            // hh:mm:ss.cc, where cc is hundredths of a second
            string   elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", ts.Hours, ts.Minutes, ts.Seconds, ts.Milliseconds / 10);

            Console.WriteLine("completed i: " + rowIndex + "/" + nKeys + " nTimeouts: " + nTimeouts[0] + " nMatchStrings:" + matchingStrings_outer.Count + " time taken: " + elapsedTime);
        }
예제 #2
0
        /// <summary>
        /// Loads one stored row, calls <c>validateCell</c> for every cell whose value
        /// equals <c>SimilarityMatrixBuilder.verifiedTimeoutFlag</c>, writes the row
        /// back to file, and logs a one-line summary.
        /// </summary>
        /// <param name="rowIndex">Index of the row to load and verify.</param>
        /// <param name="nRows">Passed to the <c>MatrixRow</c> constructor and to <c>Util.getRexGeneratedStrings</c>.</param>
        /// <param name="keyList">Maps a column position to the key used to look up its regex.</param>
        /// <param name="regexMap">Regex for each key in <paramref name="keyList"/>.</param>
        /// <param name="minSimilarity">Used to derive the error budget and passed to <c>writeRowToFile</c>.</param>
        /// <param name="allRowsBase">Passed to the <c>MatrixRow</c> constructor and to <c>writeRowToFile</c>.</param>
        /// <param name="rexStringsBase">Passed through to <c>Util.getRexGeneratedStrings</c>.</param>
        /// <param name="stressCounter">Element 0 is incremented once per regex timeout.</param>
        /// <param name="nMatchingStrings">Number of strings requested from <c>Util.getRexGeneratedStrings</c>.</param>
        private static void validateRow(int rowIndex, int nRows, List <int> keyList, Dictionary <int, Regex> regexMap, double minSimilarity, string allRowsBase, string rexStringsBase, int[] stressCounter, int nMatchingStrings)
        {
            Stopwatch timer = Stopwatch.StartNew();

            MatrixRow        storedRow        = new MatrixRow(allRowsBase, rowIndex, nRows);
            HashSet <string> generatedStrings = Util.getRexGeneratedStrings(rowIndex, nRows, rexStringsBase, nMatchingStrings);
            int              errorBudget      = 1 + (int)((1 - minSimilarity) * nMatchingStrings);

            double[] cells        = storedRow.getValues();
            int      timeoutCount = 0;

            for (int column = 0; column < cells.Length; column++)
            {
                // Remember: keyList was built from the filteredCorpus with keys added in
                // order, so the column position doubles as an index into keyList.
                // The lookup is done for every column, including ones skipped below.
                Regex columnRegex = regexMap[keyList[column]];

                // Per the original author's note: when a row is read back from file,
                // anything that was not a valid similarity (or was below the minimum) is
                // set to the verified-timeout flag, so only a small fraction of most
                // rows needs to be re-checked here.
                if (cells[column] != SimilarityMatrixBuilder.verifiedTimeoutFlag)
                {
                    continue;
                }

                try
                {
                    validateCell(column, cells, generatedStrings, columnRegex, errorBudget);
                }
                catch (RegexMatchTimeoutException)
                {
                    // A timed-out cell is only counted; no other handling is done here.
                    timeoutCount++;
                    stressCounter[0]++;
                }
            }

            storedRow.writeRowToFile(allRowsBase, minSimilarity);

            timer.Stop();
            TimeSpan elapsed = timer.Elapsed;
            // hh:mm:ss.cc, where cc is hundredths of a second
            string elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", elapsed.Hours, elapsed.Minutes, elapsed.Seconds, elapsed.Milliseconds / 10);

            string summary = "verified i: " + rowIndex + "/" + nRows + " nTimeouts: " + timeoutCount + " nMatchStrings:" + generatedStrings.Count + " time taken: " + elapsedTime;
            Console.WriteLine(summary);
        }