Example No. 1
0
    /// <summary>
    /// Feeds one batch of sequences to the mining algorithm, mines the current
    /// frequent sequences, optionally tightens the top-k support threshold and
    /// purges infrequent results, then reports them via <paramref name="resultDelegate"/>.
    /// </summary>
    /// <param name="batch">Sequences to hand to the algorithm; cleared after hand-off. May be empty.</param>
    /// <param name="resultDelegate">Callback receiving the frequent sequences, the error bound, and the iteration number.</param>
    /// <param name="minSupport">Relative support threshold used by the purge step below.</param>
    /// <param name="purge">When true, removes sequences whose absolute support falls below the threshold.</param>
    private async Task processBatch(
        List <Sequence> batch,
        FrequentSequencesBatchResult resultDelegate,
        double minSupport,
        bool purge)
    {
        // Record how long we spent reading the block that produced this batch.
        PrevBlockFileReadingTime = _blockProcessingTimer.ElapsedMilliseconds;

        CurrentIteration++;

        if (batch.Count > 0)
        {
            _algorithm.SetNextSequenceBatch(batch);
            batch.Clear();
        }

        // NOTE(review): mining uses the *field* _minSupport while the purge below
        // uses the minSupport *parameter* — presumably intentional (the field can be
        // tightened for top-k mining between calls), but worth confirming.
        Tuple <List <Sequence>, double> results = _algorithm.FrequentSequences(_minSupport);

        List <Sequence> sequences = results.Item1;

        // Tighten the threshold to the support of the sequence at index _k
        // (the (k+1)-th after sorting).
        // BUGFIX: the original condition (Count >= _k) allowed sequences[_k] to be
        // indexed when Count == _k, which throws ArgumentOutOfRangeException on a
        // 0-based list; require strictly more than _k elements before indexing.
        // NOTE(review): if the intent was the k-th best sequence, the index should
        // arguably be _k - 1 — confirm against the algorithm's definition of top-k.
        if (_mineTopK && sequences.Count > _k && purge)
        {
            sequences.Sort(Sequence.SequenceSorter);
            _minSupport = sequences[_k].Support / (double)NumTransactionsProcessed;
        }

        if (purge)
        {
            // Iterate backwards so RemoveAt does not shift the unvisited elements.
            for (var i = sequences.Count - 1; i >= 0; i--)
            {
                if (sequences[i].Support < Math.Floor(NumTransactionsProcessed * minSupport))
                {
                    sequences.RemoveAt(i);
                }
            }
        }

        await resultDelegate(sequences, results.Item2, CurrentIteration);

        // Restart the timer so the next iteration measures its own block time.
        _blockProcessingTimer.Restart();
    }
Example No. 2
0
    /// <summary>
    /// Streams a sequence database from a file and mines frequent sequences
    /// progressively, reporting results block by block through
    /// <paramref name="resultDelegate"/>. Each input line is one sequence whose
    /// transactions are separated by " -1 "; items within a transaction are
    /// space-separated. The token after the final separator is a line terminator
    /// and is skipped (the item loop stops at Length - 1).
    /// </summary>
    /// <param name="filepath">Path of the input file to read.</param>
    /// <param name="minSup">Initial relative minimum support threshold.</param>
    /// <param name="k">Number of top sequences to mine when <paramref name="mineTopK"/> is set.</param>
    /// <param name="mineTopK">If true, derive the support threshold from the top-k sequences.</param>
    /// <param name="sampleSize">Number of sequences per block / sample window.</param>
    /// <param name="dbSize">Total (estimated) number of sequences in the database.</param>
    /// <param name="errorTolerance">Error tolerance passed through to the algorithm.</param>
    /// <param name="resultDelegate">Callback invoked with each block's frequent sequences.</param>
    /// <param name="killAfter">Abort mining once this many milliseconds have elapsed (checked between blocks).</param>
    public async Task ProcessCSVFile(
        string filepath,
        double minSup,
        int k,
        bool mineTopK,
        int sampleSize,
        int dbSize,
        double errorTolerance,
        FrequentSequencesBatchResult resultDelegate,
        long killAfter)
    {
        Stopwatch killTimer = Stopwatch.StartNew();

        this._minSupport = minSup;
        this._k          = k;
        this._mineTopK   = mineTopK;

        _algorithm = new ProSecCo(minSup, mineTopK, errorTolerance, sampleSize, dbSize);

        // current batch to process
        List <Sequence> batch = new List <Sequence>();

        // false while we are still filling/refining the initial sample window
        var didProcessSampleBlocks = false;

        // open the input file; 'using' guarantees disposal even on the early return below
        using (var reader = new StreamReader(File.OpenRead(filepath)))
        {
            _blockProcessingTimer.Restart();

            while (!reader.EndOfStream)
            {
                // read the current line
                string line = reader.ReadLine();

                // BUGFIX: skip blank lines — the original turned them into a
                // Sequence with zero transactions, polluting the batch and the
                // processed-transaction count.
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                // split the line into transactions on the " -1 " separator
                string [] transactions = line.Split(new string[] { " -1 " }, StringSplitOptions.None);

                // last element is the line terminator token, hence Length - 1
                List <Transaction> tr = new List <Transaction>(transactions.Length);
                for (int i = 0; i < transactions.Length - 1; i++)
                {
                    string []   items = transactions[i].Split(' ');
                    Transaction trans = new Transaction(items);
                    tr.Add(trans);
                }

                Sequence sequence = new Sequence(tr);
                batch.Add(sequence);

                if (!didProcessSampleBlocks)
                {
                    // sample phase: accumulate blocks until the sample error is small enough
                    if (batch.Count % sampleSize == 0)
                    {
                        NumTransactionsProcessed += batch.Count;

                        // update the sample and get back the error for the sample;
                        // the timer is paused so algorithm time is not billed as read time
                        _blockProcessingTimer.Stop();
                        var sampleError = _algorithm.AddSequenceBatchToSampleWindow(batch);
                        _blockProcessingTimer.Start();

                        // derive the initial support threshold from the first sample (top-k mode only)
                        if (!_didGetTopKSupport && _mineTopK)
                        {
                            _minSupport        = _algorithm.GetTopKMinSupport(_k);
                            _didGetTopKSupport = true;
                        }

                        batch.Clear();

                        Console.WriteLine("[DEBUG]: Min support=" + _minSupport + " Error=" + sampleError);

                        // make sure we have enough transactions so that the error is
                        // comfortably below the support threshold before going progressive
                        if (sampleError <= _minSupport * 0.75)
                        {
                            Console.WriteLine("[DEBUG]: Starting progressive algorithm.");
                            didProcessSampleBlocks = true;
                            _blockProcessingTimer.Stop();
                            _algorithm.ProcessSampleWindow();
                            _blockProcessingTimer.Start();
                            // batch is empty here: this emits the first result set from the sample
                            await processBatch(batch, resultDelegate, _minSupport, false);
                        }
                    }
                }
                // progressive phase: process every full block
                // (the redundant didProcessSampleBlocks check was removed — the else
                // branch already guarantees it; batch.Count > 0 is still required
                // because 0 % sampleSize == 0)
                else if (batch.Count > 0 && batch.Count % sampleSize == 0)
                {
                    if (killAfter <= killTimer.ElapsedMilliseconds)
                    {
                        return; // kill the mining
                    }
                    NumTransactionsProcessed += batch.Count;

                    await processBatch(batch, resultDelegate, _minSupport, false);
                }
            }
        }

        // flush whatever partial batch remains after EOF
        NumTransactionsProcessed += batch.Count;
        if (!didProcessSampleBlocks)
        {
            // handle small data set case: the whole file fit inside the sample window
            _algorithm.AddSequenceBatchToSampleWindow(batch);
            batch.Clear();

            _algorithm.ProcessSampleWindow();
            didProcessSampleBlocks = true;

            await processBatch(batch, resultDelegate, _minSupport, true);
        }
        else
        {
            await processBatch(batch, resultDelegate, _minSupport, true);
        }
    }