예제 #1
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with a default-constructed
        /// <c>NuveTokenizer</c> and hands the resulting token list back to the caller.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <returns>The tokens produced by <c>NuveTokenizer.Tokenize</c>.</returns>
        public IList<string> TestNuveTokenizerReturnDelimsFalse(string text)
        {
            return new NuveTokenizer().Tokenize(text);
        }
예제 #2
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with a <c>NuveTokenizer</c> constructed
        /// with <c>true</c>, and asserts that the concatenated tokens have exactly the
        /// same length as the input text.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <returns>The tokens produced by <c>NuveTokenizer.Tokenize</c>.</returns>
        public IList <string> TestNuveTokenizerReturnDelimsTrue(string text)
        {
            var            tokenizer = new NuveTokenizer(true);
            IList <string> tokens    = tokenizer.Tokenize(text);
            int            length    = string.Join("", tokens).Length;

            // Expected value first, actual value second, so that a failure message
            // reports the input length as "expected" and the token length as "actual".
            Assert.AreEqual(text.Length, length);
            return(tokens);
        }
        /// <summary>
        /// Entry point of an MPI bigram counter over "news.txt".
        /// Rank 0 (master) counts the input lines, sends every worker a
        /// [start, end) line interval, aggregates the bigram strings the workers send
        /// back, sorts the counts with <c>countSort</c> and writes them to "out.txt".
        /// Every other rank (worker) tokenizes the lines of its interval, sends each
        /// bigram to the master, then a termination marker, and finally the boundary
        /// words the master needs to build bigrams that span two neighbouring intervals.
        /// </summary>
        /// <param name="args">Command-line arguments, passed by reference to <c>MPI.Environment.Run</c>.</param>
        static void Main(string[] args)
        {
            MPI.Environment.Run(ref args, communicator =>
            {
                // Sentinel used for "no word seen yet"; also sent as a boundary word by
                // a worker whose interval produced no tokens.
                const string noWord = " ";
                // Message a worker sends on tag 0 once it has sent all of its bigrams.
                const string terminationMarker = "###-TERMINATION-###";

                if (communicator.Rank == 0) //Master process
                {
                    Console.WriteLine("Process with rank " + communicator.Rank + " successfully running. Total rank " + communicator.Size);

                    if (communicator.Size < 2)
                    {
                        // Without workers the interval computation below would divide by zero.
                        Console.WriteLine("At least 2 processes are required: 1 master and 1 or more workers.");
                        return;
                    }

                    //Timer
                    System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();

                    long totalLineNumber;
                    using (FileStream readInput = File.OpenRead("news.txt"))
                    {
                        totalLineNumber = StreamExtensions.CountLines(readInput, default);
                    }
                    Console.WriteLine("Total line number in the input text: " + totalLineNumber);

                    int  workerCount             = communicator.Size - 1;
                    long intervalForEachProcess  = totalLineNumber / workerCount;
                    long startIndexOfEachProcess = 0;

                    //Distribute the lines equally; each worker receives its start and end line on tag 0
                    for (int rank = 1; rank < communicator.Size; rank++)
                    {
                        //The last worker additionally takes the remainder of the division
                        long endIndexOfProcess = rank == workerCount
                            ? totalLineNumber
                            : startIndexOfEachProcess + intervalForEachProcess;

                        communicator.Send(startIndexOfEachProcess, rank, 0);
                        communicator.Send(endIndexOfProcess, rank, 0);

                        startIndexOfEachProcess += intervalForEachProcess;
                    }

                    //BiGram - count
                    Dictionary <String, int> hashMap = new Dictionary <String, int>();

                    // Adds one occurrence of the given bigram with a single dictionary lookup.
                    void CountBiGram(String biGram)
                    {
                        hashMap.TryGetValue(biGram, out int count);
                        hashMap[biGram] = count + 1;
                    }

                    //Collect bigrams from any worker until every worker has sent its termination marker
                    int terminator = 0;
                    while (terminator != workerCount)
                    {
                        String recievedBiGramString = communicator.Receive <String>(MPI.Communicator.anySource, 0);
                        if (recievedBiGramString == terminationMarker)
                        {
                            terminator++;
                        }
                        else
                        {
                            CountBiGram(recievedBiGramString);
                        }
                    }
                    Console.WriteLine("Loop terminated.");

                    //Handles biGrams made of the last word of an interval and the first word of the next interval.
                    //On tag == rank every worker sends its first word (if it has a predecessor) and then its
                    //last word (if it has a successor), so the next message from worker i is its last word and
                    //the next message from worker i + 1 is its first word.
                    String previousWord = noWord;
                    for (int i = 1; i < communicator.Size - 1; i++)
                    {
                        String lastWordOfInterval      = communicator.Receive <String>(i, i);
                        String firstWordOfNextInterval = communicator.Receive <String>(i + 1, i + 1);

                        //An interval without tokens reports the sentinel; keep the last real word seen so far
                        if (lastWordOfInterval != noWord)
                        {
                            previousWord = lastWordOfInterval;
                        }

                        if (previousWord == noWord || firstWordOfNextInterval == noWord)
                        {
                            continue;
                        }

                        //Same "earlier-word later-word" order as the in-line bigrams built by the workers
                        String boundaryBiGram = previousWord + " " + firstWordOfNextInterval;
                        Console.WriteLine(boundaryBiGram + " " + i + " " + (i + 1));
                        CountBiGram(boundaryBiGram);
                    }
                    Console.WriteLine("Separation points done.");

                    watch.Stop();
                    long elapsedMs     = watch.ElapsedMilliseconds;
                    long elapsedSecond = elapsedMs / 1000;
                    Console.WriteLine("biGram analysis time: " + elapsedSecond / 60 + "  min" + elapsedSecond % 60 + " sec");

                    //The stopwatch keeps accumulating, so the sort time is the difference to the overall time
                    watch.Start();
                    KeyValuePair <String, int>[] output = countSort(hashMap);
                    watch.Stop();

                    long overall    = watch.ElapsedMilliseconds;
                    long overallSec = overall / 1000;
                    Console.WriteLine("Sort time: " + (overallSec - elapsedSecond) / 60 + "  min" + (overallSec - elapsedSecond) % 60 + " sec");
                    Console.WriteLine("Overall time: " + overallSec / 60 + "  min" + overallSec % 60 + " sec");

                    using (StreamWriter writer = File.CreateText("out.txt"))
                    {
                        foreach (KeyValuePair <String, int> entry in output)
                        {
                            writer.WriteLine(entry);
                        }
                    }

                    Console.WriteLine("Output.txt has been written successfully.");
                }
                else // Processes that makes bigram analysis
                {
                    Console.WriteLine("Process with rank " + communicator.Rank + " successfully running.");
                    long intervalStart = communicator.Receive <long>(0, 0);
                    long intervalEnd   = communicator.Receive <long>(0, 0);
                    Console.WriteLine(intervalStart + " " + intervalEnd);

                    String firstWordOfInterval = noWord;
                    bool isFirstWord           = true;

                    // Last word of the most recent non-empty line; paired with the first word of the
                    // next non-empty line so that no bigram is lost at a line break. After the loop it
                    // holds the last word of the whole interval.
                    String lastWordOfLine = noWord;

                    using (StreamReader readInput = new StreamReader("news.txt"))
                    {
                        //Skip the lines that belong to the preceding intervals
                        for (long i = 0; i < intervalStart; i++)
                        {
                            readInput.ReadLine();
                        }

                        NuveTokenizer tokenize = new NuveTokenizer(false);

                        //The main loop which makes bigram analysis
                        for (long i = intervalStart; i < intervalEnd; i++)
                        {
                            String line = readInput.ReadLine();
                            if (line == null)
                            {
                                //The file ended before the counted line total; stop reading so the
                                //termination marker below is still sent and the master does not hang
                                break;
                            }

                            IList <String> tokenizedInput = tokenize.Tokenize(line);
                            //Instead of natural language processing library Nuve, simply following line could have been used.
                            //tokenizedInput = line.Split();

                            if (!tokenizedInput.IsNotNullOrEmpty())
                            {
                                continue;
                            }

                            if (isFirstWord)
                            {
                                firstWordOfInterval = tokenizedInput[0];
                                isFirstWord         = false;
                            }

                            //Bigram across the line break: last word of the previous line, then first word of this line
                            if (lastWordOfLine != noWord)
                            {
                                communicator.Send(lastWordOfLine + " " + tokenizedInput[0], 0, 0);
                            }

                            for (int k = 0; k < tokenizedInput.Count - 1; k++)
                            {
                                String biGram = tokenizedInput[k] + " " + tokenizedInput[k + 1];
                                communicator.Send(biGram, 0, 0);
                            }

                            //Updated for single-token lines as well, which the pair loop above never enters
                            lastWordOfLine = tokenizedInput[tokenizedInput.Count - 1];
                        }
                    }
                    communicator.Send(terminationMarker, 0, 0);

                    //Boundary words for the biGrams that span two intervals, sent on tag == rank.
                    //The first worker has no predecessor and the last worker has no successor, so each
                    //sends only the word the master actually pairs up.
                    bool hasPreviousInterval = communicator.Rank > 1;
                    bool hasNextInterval     = communicator.Rank < communicator.Size - 1;

                    if (hasPreviousInterval || hasNextInterval)
                    {
                        Console.WriteLine("Trying to send from " + communicator.Rank);
                    }
                    if (hasPreviousInterval)
                    {
                        communicator.Send(firstWordOfInterval, 0, communicator.Rank);
                    }
                    if (hasNextInterval)
                    {
                        communicator.Send(lastWordOfLine, 0, communicator.Rank);
                    }
                }
            });
        }