Ejemplo n.º 1
0
 /// <summary>
 /// Creates a group engine that works on the given search engine instance.
 /// </summary>
 /// <param name="searchEngine">Search engine stored in this instance; must not be null.</param>
 /// <exception cref="System.ArgumentNullException">
 /// <paramref name="searchEngine"/> is null.
 /// </exception>
 public GroupEngine(SearchEngine searchEngine)
 {
     // Fail at construction time instead of with a NullReferenceException on first use.
     if (searchEngine == null)
     {
         throw new System.ArgumentNullException(nameof(searchEngine));
     }

     this.searchEngine = searchEngine;
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Interactive console entry point. Asks for the working mode (search engine or
        /// k-means grouping), loads the keywords and documents files, then serves menu
        /// actions until the user quits or standard input is closed.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            try
            {
                runConsoleSession();
            }
            catch (EndOfStreamException)
            {
                // Console.ReadLine() returns null once standard input is closed (for example
                // when piped input runs out). Without this, every prompt loop would keep
                // re-reading null forever; end of input now ends the session like "quit".
                Console.WriteLine("Closing.");
            }
        }

        /// <summary>
        /// Runs the whole interactive session: mode selection, data loading and the action menu.
        /// </summary>
        /// <exception cref="EndOfStreamException">Standard input was closed while a prompt was waiting.</exception>
        private static void runConsoleSession()
        {
            const int SearchEngineMode   = 1;
            const int GroupingEngineMode = 2;

            SearchEngine searchEngine = new SearchEngine();

            Console.WriteLine("Choose mode: 1 - search engine, 2 - k-means grouping");
            int mode = readValidatedInt(
                candidate => candidate == SearchEngineMode || candidate == GroupingEngineMode,
                "Invalid mode. Try again.");
            bool isSearchMode = mode == SearchEngineMode;

            Console.WriteLine("Type keywords file name.");
            loadFileWithRetry(fileName => loadKeywords(searchEngine, fileName));

            Console.WriteLine("Type documents file name.");
            loadFileWithRetry(fileName => loadDocuments(searchEngine, fileName, isSearchMode));

            GroupEngine groupEngine = new GroupEngine(searchEngine);

            bool closing = false;
            while (!closing)
            {
                if (isSearchMode)
                {
                    Console.WriteLine("Choose action: 1 - search, 2 - display document, " +
                                      "3 - display tokens, 4 - display stemmed tokens, 5 - quit");
                }
                else
                {
                    Console.WriteLine("Choose action: 1 - classify, 2 - display document, " +
                                      "3 - display tokens, 4 - display stemmed tokens, 5 - quit");
                }

                switch (readRequiredLine())
                {
                case "1":
                    // The mode was validated above, so "not search" always means grouping.
                    if (isSearchMode)
                    {
                        runSearch(searchEngine);
                    }
                    else
                    {
                        runGrouping(groupEngine);
                    }
                    break;

                case "2":
                    searchEngine.documents[searchEngine.readDocumentNumber()].displayDocument();
                    break;

                case "3":
                    searchEngine.documents[searchEngine.readDocumentNumber()].displayTokens();
                    break;

                case "4":
                    searchEngine.documents[searchEngine.readDocumentNumber()].displayStemmedTokens();
                    break;

                case "5":
                    Console.WriteLine("Closing.");
                    closing = true;
                    break;

                default:
                    Console.WriteLine("Incorrect option. Try again.");
                    break;
                }
            }
        }

        /// <summary>
        /// Reads one line from the console.
        /// </summary>
        /// <returns>The line that was read; never null.</returns>
        /// <exception cref="EndOfStreamException">No more input is available.</exception>
        private static string readRequiredLine()
        {
            string line = Console.ReadLine();
            if (line == null)
            {
                throw new EndOfStreamException("Standard input was closed.");
            }
            return line;
        }

        /// <summary>
        /// Keeps reading lines until one parses as an integer accepted by <paramref name="isValid"/>.
        /// </summary>
        /// <param name="isValid">Predicate deciding whether a parsed number is acceptable.</param>
        /// <param name="rejectionMessage">Message printed when a parsed number is rejected.</param>
        /// <returns>The first accepted number.</returns>
        private static int readValidatedInt(Func<int, bool> isValid, string rejectionMessage)
        {
            while (true)
            {
                int value;
                if (!Int32.TryParse(readRequiredLine(), out value))
                {
                    Console.WriteLine("Invalid input. Try again.");
                }
                else if (!isValid(value))
                {
                    Console.WriteLine(rejectionMessage);
                }
                else
                {
                    return value;
                }
            }
        }

        /// <summary>
        /// Reads a file name and passes it to <paramref name="loadFile"/>, asking again
        /// whenever the load fails with a file-not-found or argument error.
        /// </summary>
        /// <param name="loadFile">Action that loads and processes the named file.</param>
        private static void loadFileWithRetry(Action<string> loadFile)
        {
            while (true)
            {
                string fileName = readRequiredLine();
                try
                {
                    loadFile(fileName);
                    return;
                }
                catch (FileNotFoundException)
                {
                    Console.WriteLine("File not found. Try again.");
                }
                catch (DirectoryNotFoundException)
                {
                    // .NET file APIs report a missing directory with this type, which does
                    // not derive from FileNotFoundException and would otherwise escape.
                    Console.WriteLine("File not found. Try again.");
                }
                catch (ArgumentException)
                {
                    Console.WriteLine("Invalid file name. Try again.");
                }
            }
        }

        /// <summary>
        /// Reads the keywords file and rebuilds the de-duplicated list of stemmed keywords.
        /// </summary>
        private static void loadKeywords(SearchEngine searchEngine, string fileName)
        {
            searchEngine.readKeywordsFile(fileName);
            foreach (string keyword in searchEngine.keywords)
            {
                searchEngine.stemmedKeywords.Add(Utils.stemToken(keyword, searchEngine.stemmer));
            }
            searchEngine.stemmedKeywords = searchEngine.stemmedKeywords.Distinct().ToList();
        }

        /// <summary>
        /// Reads the documents file, processes every document, then computes the IDF vector
        /// and each document's TF-IDF vector.
        /// </summary>
        /// <param name="searchEngine">Engine that owns the documents and keyword data.</param>
        /// <param name="fileName">Documents file name as typed by the user.</param>
        /// <param name="isSearchMode">
        /// Passed as the last argument of <c>processDocument</c>: true in search mode,
        /// false in grouping mode.
        /// </param>
        private static void loadDocuments(SearchEngine searchEngine, string fileName, bool isSearchMode)
        {
            searchEngine.readDocumentsFile(fileName);
            foreach (Document doc in searchEngine.documents)
            {
                doc.processDocument(searchEngine.stemmedKeywords, searchEngine.stemmer, isSearchMode);
            }

            // The IDF vector is created only after every document has been processed.
            searchEngine.IDFVector = searchEngine.createIDFVector();
            foreach (Document doc in searchEngine.documents)
            {
                doc.TFIDFVector = Utils.multiplyVectorsCoords(doc.TFVector, searchEngine.IDFVector);
            }
        }

        /// <summary>
        /// Tokenizes and stems a query, then turns it into a TF-IDF vector over the stemmed keywords.
        /// </summary>
        private static List<double> buildQueryTFIDFVector(SearchEngine searchEngine, string query)
        {
            List<string> stemmedQueryTokens = new List<string>();
            foreach (string token in Utils.extractTokens(query))
            {
                stemmedQueryTokens.Add(Utils.stemToken(token, searchEngine.stemmer));
            }

            List<double> queryTFVector = Utils.createTFVector(searchEngine.stemmedKeywords, stemmedQueryTokens);
            return Utils.multiplyVectorsCoords(queryTFVector, searchEngine.IDFVector);
        }

        /// <summary>
        /// Scores every document against the query vector and returns the documents ordered
        /// by descending score.
        /// </summary>
        private static List<KeyValuePair<Document, double>> rankDocuments(SearchEngine searchEngine, List<double> queryTFIDFVector)
        {
            // A list (not a dictionary keyed by document) so that two documents comparing
            // equal cannot abort the search with a duplicate-key exception.
            List<KeyValuePair<Document, double>> scored = new List<KeyValuePair<Document, double>>();
            foreach (Document doc in searchEngine.documents)
            {
                double cosinus = Utils.calculateVectorsCosinus(queryTFIDFVector, doc.TFIDFVector);
                scored.Add(new KeyValuePair<Document, double>(doc, cosinus));
            }

            // OrderByDescending is a stable sort, so ties keep document order.
            return scored.OrderByDescending(pair => pair.Value).ToList();
        }

        /// <summary>
        /// Prints one line per ranked document: zero-based position, separator, score and title.
        /// </summary>
        private static void printRanking(List<KeyValuePair<Document, double>> ranking, string separator)
        {
            int index = 0;
            foreach (KeyValuePair<Document, double> kvp in ranking)
            {
                Console.WriteLine(index + separator + kvp.Value + " " + kvp.Key.title);
                index++;
            }
        }

        /// <summary>
        /// Search action: ranks documents for a typed query and, on request, shows keyword
        /// weights derived from the documents the user marked as relevant before ranking
        /// a follow-up query.
        /// </summary>
        private static void runSearch(SearchEngine searchEngine)
        {
            Console.WriteLine("Type query.");
            List<double> queryTFIDFVector = buildQueryTFIDFVector(searchEngine, readRequiredLine());
            List<KeyValuePair<Document, double>> ranking = rankDocuments(searchEngine, queryTFIDFVector);
            printRanking(ranking, ".");

            Console.WriteLine("Do you want to choose relevent documents. 1 - yes, 2 - no");
            if (readRequiredLine() != "1")
            {
                // "2" and any other answer both skip the feedback step.
                return;
            }

            Console.WriteLine("Choose numbers of relevant documents. Format: 'number of document' 'number od document ...");
            List<int> relevantIndexes = Utils.takeIndexesOfDocuments(readRequiredLine());

            // Each helper receives its own copy of the ranking, in case it modifies the list.
            var relevant   = Utils.getRelevantDocuments(relevantIndexes, ranking.ToList());
            var unRelevant = Utils.getUnRelevantDocuments(relevantIndexes, ranking.ToList());
            var newQuery   = Utils.calculateNewQueryTFIDFQuery(relevant, unRelevant, queryTFIDFVector);

            // Pair every stemmed keyword with its weight in the recalculated query vector.
            Dictionary<string, double> keywordWeights = new Dictionary<string, double>();
            for (int i = 0; i < newQuery.Count(); i++)
            {
                keywordWeights.Add(searchEngine.stemmedKeywords[i], newQuery[i]);
            }

            foreach (var weighted in keywordWeights.OrderByDescending(entry => entry.Value))
            {
                if (weighted.Value != 0)
                {
                    Console.WriteLine($"{weighted.Key} {weighted.Value}");
                }
            }

            Console.WriteLine("Type new query");
            List<double> refinedQueryVector = buildQueryTFIDFVector(searchEngine, readRequiredLine());
            printRanking(rankDocuments(searchEngine, refinedQueryVector), ".   ");
        }

        /// <summary>
        /// Grouping action: asks for the number of groups (2 to 20) and the number of
        /// iterations (at least 1), then runs the classification.
        /// </summary>
        private static void runGrouping(GroupEngine groupEngine)
        {
            Console.WriteLine("Type number of groups to classify documents.");
            // TODO: define maximal number of groups
            int groupsNumber = readValidatedInt(
                candidate => candidate >= 2 && candidate <= 20,
                "Invalid groups number. Try again.");

            Console.WriteLine("Type number of iterations of k-means algorithm.");
            int iterationsNumber = readValidatedInt(
                candidate => candidate >= 1,
                "Invalid iterations number. Try again.");

            groupEngine.classify(groupsNumber, iterationsNumber);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Entry point: applies the command-line options to the static settings, creates the
        /// search engine, loads the initial data when a source was given, then runs the
        /// interactive prompt loop.
        /// </summary>
        /// <param name="args">
        /// Command-line options. Every option except help takes its value from the next
        /// argument; an option followed by another option is ignored.
        /// </param>
        /// <returns>
        /// -2 when the arguments cannot be processed; 0 when standard input is closed.
        /// </returns>
        static int Main(string[] args)
        {
            try
            {
                for (int i = 0; i < args.Length; i++)
                {
                    string value;
                    switch (args[i])
                    {
                        case "--help":
                        case "-h":
                            // NOTE(review): help is printed but start-up continues afterwards,
                            // and --orderFixed / --numberOfPermutation are not listed here.
                            Console.WriteLine("--debug -d Show additional information");
                            Console.WriteLine("--help -h Show help");
                            Console.WriteLine("--memory-limit -m Max memory size. 0 for disable");
                            Console.WriteLine("--normalize -n Pre-process every word before insert");
                            Console.WriteLine("--pattern -p Pattern for removing unwanted characters, used for each word before insert");
                            Console.WriteLine("--source -s Load data from specific path at start");
                            Console.WriteLine("--extension -e Set extension for loading data at start");
                            break;

                        case "--debug":
                        case "-d":
                            if (TryReadOptionValue(args, i, out value))
                            {
                                debug = value == "true";
                            }
                            break;

                        case "--memory-limit":
                        case "-m":
                            if (TryReadOptionValue(args, i, out value))
                            {
                                // A non-numeric value throws here and is reported by the catch below.
                                memoryLimit = Convert.ToInt32(value);
                            }
                            break;

                        case "--normalize":
                        case "-n":
                            if (TryReadOptionValue(args, i, out value))
                            {
                                normalize = value == "true";
                            }
                            break;

                        case "--orderFixed":
                        case "-of":
                            if (TryReadOptionValue(args, i, out value))
                            {
                                orderFixed = value == "true";
                            }
                            break;

                        case "--numberOfPermutation":
                        case "-nop":
                            if (TryReadOptionValue(args, i, out value))
                            {
                                // Falls back to 2 when the value is not a valid integer.
                                numberOfPermutation = int.TryParse(value, out var nop) ? nop : 2;
                            }
                            break;

                        case "--pattern":
                        case "-p":
                            if (TryReadOptionValue(args, i, out value))
                            {
                                pattern = value;
                            }
                            break;

                        case "--source":
                        case "-s":
                            if (TryReadOptionValue(args, i, out value))
                            {
                                initialSource = value;
                            }
                            break;

                        case "--extension":
                        case "-e":
                            if (TryReadOptionValue(args, i, out value))
                            {
                                initialExtension = value;
                            }
                            break;
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Wrong arguments {Environment.NewLine} {ex}");
                return -2;
            }

            // sample data http://mlg.ucd.ie/datasets/bbc.html
            _SearchEngine = new SearchEngine(debug, normalize, orderFixed, numberOfPermutation, pattern, memoryLimit);

            if (initialSource != "")
            {
                LoadFromSource(initialSource, initialExtension);
            }

            while (true)
            {
                Console.Write(_prompt);
                string userInput = Console.ReadLine();
                if (userInput == null)
                {
                    // Console.ReadLine() returns null once standard input is closed; without
                    // this check the loop would hand null to ParseInput forever.
                    return 0;
                }

                ParseInput(userInput);
            }
        }

        /// <summary>
        /// Looks up the value that follows the option at <paramref name="index"/>.
        /// </summary>
        /// <param name="args">All command-line arguments.</param>
        /// <param name="index">Position of the option whose value is wanted.</param>
        /// <param name="value">The following argument when it is a value; otherwise null.</param>
        /// <returns>
        /// true when the next argument does not start with '-'; false when it does, in
        /// which case the caller leaves the setting untouched.
        /// </returns>
        /// <exception cref="ArgumentException">The option is the last argument, so no value follows it.</exception>
        private static bool TryReadOptionValue(string[] args, int index, out string value)
        {
            if (index + 1 >= args.Length)
            {
                // Reported explicitly instead of relying on an out-of-range array access.
                throw new ArgumentException($"Missing value for option '{args[index]}'.");
            }

            string candidate = args[index + 1];
            if (candidate.StartsWith("-", StringComparison.Ordinal))
            {
                value = null;
                return false;
            }

            value = candidate;
            return true;
        }