Ejemplo n.º 1
0
        /// <summary>
        /// Processes the candidate subsets whose occurrence count equals <paramref name="maxOccurance"/>.
        /// Candidates are ordered by size, those that <c>ContainsTokenSubset</c> reports as covered by a
        /// later candidate are dropped, and each remaining subset is used to remove matching disjuncts
        /// from <paramref name="disjuncts"/>. A subset is appended to <paramref name="finalQueryList"/>
        /// only when it actually removed at least one disjunct.
        /// </summary>
        /// <remarks>
        /// For occurrence levels above 1 every subset is profiled through <c>TwitterProfiler</c> first and
        /// is only applied when its frequency is below the threshold. If the last subset of the level is
        /// rejected, the method recurses with the highest occurrence value below the current one; if the
        /// last subset is accepted (or the level is 1 and disjuncts remain), the search is restarted via
        /// <c>QueryConverterUtils.ExtractUniqueQueryTokens</c>.
        /// NOTE(review): a profiling error terminates the whole process with <c>Environment.Exit(1)</c>;
        /// this is kept as-is because callers may rely on it.
        /// </remarks>
        /// <param name="disjuncts">Remaining disjuncts; modified in place as subsets are applied.</param>
        /// <param name="finalQueryList">Receives every subset that removed at least one disjunct.</param>
        /// <param name="twitterProfilerFrequencyThreshold">A profiled subset is applied only when its frequency is below this value.</param>
        /// <param name="subsetsGrouped">Candidate subsets.</param>
        /// <param name="subsetsGroupedOccurance">Occurrence counts, indexed in parallel with <paramref name="subsetsGrouped"/>.</param>
        /// <param name="maxOccurance">Occurrence level to process in this call.</param>
        private static void ExtractUniqueQueryTokensBasedOnOccurance(ArrayList<ArrayList<QueryToken>> disjuncts, 
                                                                    ArrayList<ArrayList<QueryToken>> finalQueryList,
                                                                    double twitterProfilerFrequencyThreshold, 
                                                                    ArrayList<ArrayList<QueryToken>> subsetsGrouped, 
                                                                    ArrayList<int> subsetsGroupedOccurance, 
                                                                    int maxOccurance)
        {
            // Candidates at the requested occurrence level, smallest subsets first.
            var candidates = subsetsGrouped
                                .Where((subset, idx) => subsetsGroupedOccurance[idx] == maxOccurance)
                                .OrderBy(subset => subset.Count)
                                .ToList();

            // Drop every candidate that is covered by a candidate positioned after it.
            // Presumably ContainsTokenSubset(a, b) is true when a is contained in b - confirm against its definition.
            var subsetsWithMaxOccurance = candidates
                                .Where((subset, idx) => !candidates.Skip(idx + 1).Any(later => ContainsTokenSubset(subset, later)))
                                .ToList();

            if (maxOccurance <= 1)
            {
                // Occurrence of 1 -> profiling not needed, apply every subset directly.
                foreach (var subset in subsetsWithMaxOccurance)
                {
                    RemoveDisjunctsAndRecordSubset(disjuncts, finalQueryList, subset);

                    if (disjuncts.Count == 0)
                    {
                        return;
                    }
                }

                // Disjuncts remain: find new possible subsets.
                QueryConverterUtils.ExtractUniqueQueryTokens(disjuncts, finalQueryList, twitterProfilerFrequencyThreshold);
                return;
            }

            int lastIndex = subsetsWithMaxOccurance.Count - 1;
            for (int i = 0; i <= lastIndex; ++i)
            {
                var subset = subsetsWithMaxOccurance[i];
                bool isLastSubset = i == lastIndex;

                // Twitter profiling of the subset rendered as a search query.
                string searchQuery = TwitterProfiler.QueryTokensToStringConverter(subset);
                double twitterProfilerFrequency = TwitterProfiler.ProfileFrequency(searchQuery);

                if (Math.Round(twitterProfilerFrequency) == Convert.ToInt32(TwitterProfiler.InvalidReturns.Exception))
                {
                    Console.WriteLine("\nError during twitter profiling: {0}!", TwitterProfiler.GetExceptionError());
                    System.Environment.Exit(1);
                }

                if (twitterProfilerFrequency < twitterProfilerFrequencyThreshold)
                {
                    RemoveDisjunctsAndRecordSubset(disjuncts, finalQueryList, subset);

                    if (disjuncts.Count == 0)
                    {
                        return;
                    }

                    if (isLastSubset)
                    {
                        // Level exhausted with disjuncts left: find new possible subsets.
                        QueryConverterUtils.ExtractUniqueQueryTokens(disjuncts, finalQueryList, twitterProfilerFrequencyThreshold);
                        return;
                    }

                    continue;
                }

                if (!isLastSubset)
                {
                    continue;
                }

                // The last subset of this level was rejected: fall back to the first lower occurrence level.
                // A separate local is used so the lambda below does not capture a variable that is reassigned.
                ArrayList<int> lowerOccurances = subsetsGroupedOccurance.FindAll(x => x < maxOccurance);
                int nextOccurance = lowerOccurances.Count > 0 ? lowerOccurances.Max() : 0;

                if (nextOccurance < 1)
                {
                    return;
                }

                QueryConverterUtils.ExtractUniqueQueryTokensBasedOnOccurance(disjuncts, finalQueryList, twitterProfilerFrequencyThreshold, subsetsGrouped, subsetsGroupedOccurance, nextOccurance);
                return;
            }
        }

        /// <summary>
        /// Removes from <paramref name="disjuncts"/> the disjuncts matched by <paramref name="subset"/>
        /// (via <c>QueryConverterUtils.RemoveDisjunctsContainingSubsets</c>) and appends the subset to
        /// <paramref name="finalQueryList"/> when the removal shrank the disjunct list.
        /// </summary>
        private static void RemoveDisjunctsAndRecordSubset(ArrayList<ArrayList<QueryToken>> disjuncts,
                                                           ArrayList<ArrayList<QueryToken>> finalQueryList,
                                                           ArrayList<QueryToken> subset)
        {
            int disjunctsCountBeforeRemoval = disjuncts.Count;

            // remove disjuncts from the original list that contain the selected subset
            QueryConverterUtils.RemoveDisjunctsContainingSubsets(disjuncts, subset);

            if (disjuncts.Count < disjunctsCountBeforeRemoval)
            {
                finalQueryList.Add(subset);
            }
        }