Ejemplo n.º 1
0
        /// <summary>
        /// Replaces <c>sample.words</c> with a new dictionary keyed by the Porter2 stem of
        /// each original key. Counts of words that share a stem are summed.
        /// </summary>
        /// <param name="sample">Sample whose <c>words</c> dictionary is rewritten.</param>
        static void Stem(Sample sample)
        {
            EnglishPorter2Stemmer    stemmer      = new EnglishPorter2Stemmer();
            Dictionary <string, int> stemmedWords = new Dictionary <string, int>();

            foreach (var word in sample.words)
            {
                var stemmedKey = stemmer.Stem(word.Key).Value;

                // One lookup instead of ContainsKey + GetValueOrDefault + Remove + Add:
                // when the stem is new, TryGetValue leaves runningTotal at 0.
                stemmedWords.TryGetValue(stemmedKey, out int runningTotal);
                stemmedWords[stemmedKey] = runningTotal + word.Value;
            }

            // The old dictionary is emptied before being replaced, exactly as before, so any
            // other holder of that instance observes it as cleared.
            sample.words.Clear();
            sample.words = stemmedWords;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Checks that stemming <paramref name="unstemmed"/> yields <paramref name="expected"/>.
        /// </summary>
        /// <param name="unstemmed">Word handed to the stemmer.</param>
        /// <param name="expected">Stem the word is expected to reduce to.</param>
        public void Stem_WithBatchData_StemsAllWordsCorrectly(string unstemmed, string expected)
        {
            // Act
            string actual = new EnglishPorter2Stemmer().Stem(unstemmed).Value;

            // Assert
            Assert.AreEqual(expected, actual);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Splits <paramref name="text"/> into tokens. A token is a maximal run of characters
        /// that are letters/digits or accepted by <c>IsArabic</c>; every other character is a
        /// separator. Tokens made up solely of letters/digits are stemmed before being added;
        /// any other token is added unchanged.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <returns>The tokens in order of appearance.</returns>
        public List <string> parseText(string text)
        {
            List <string> textTokens = new List <string>();
            var           ep2s       = new EnglishPorter2Stemmer();
            int           tokenStart = -1; // index of the current token's first char, -1 when between tokens

            // Run one position past the end: the virtual end-of-text "separator" flushes the
            // last token through the same code path as every other token.
            for (int i = 0; i <= text.Length; i++)
            {
                bool isTokenChar = i < text.Length &&
                                   (char.IsLetterOrDigit(text[i]) || IsArabic(text[i])); //Accept english and arabic only

                if (isTokenChar)
                {
                    if (tokenStart < 0)
                    {
                        tokenStart = i;
                    }
                    continue;
                }
                if (tokenStart < 0)
                {
                    continue; // separator outside of a token: nothing to flush
                }

                // Slice the token out in one go instead of appending char by char.
                string token = text.Substring(tokenStart, i - tokenStart);
                tokenStart = -1;

                if (token.All(ch => char.IsLetterOrDigit(ch)))
                {
                    textTokens.Add(ep2s.Stem(token).Value); //stem then add
                }
                else //for arabic
                {
                    textTokens.Add(token);
                }
            }
            return(textTokens);
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Stems every token in <paramref name="filteredTokens"/> with <c>toStem</c>.
 /// The result list and the last stemmed word are also kept in the
 /// <c>outList</c> / <c>outWord</c> members.
 /// </summary>
 /// <param name="filteredTokens">Tokens to stem, in order.</param>
 /// <returns>The stemmed values, one per input token.</returns>
 public List <string> goStemmer(string[] filteredTokens)
 {
     outList = new List <string>();
     for (int index = 0; index < filteredTokens.Length; index++)
     {
         outWord = toStem.Stem(filteredTokens[index]);
         outList.Add(outWord.Value);
     }
     return(outList);
 }
        /// <summary>
        /// Replaces every word that belongs to the <paramref name="patternWordsType"/> word
        /// list with a pattern sign (<c>"&gt;"</c> + lower-cased first letter of the type) and
        /// records the replaced words under that sign in <c>_cutWords</c>.
        /// When the sign is <c>"&gt;v"</c> the stemmed form of the word is used for both the
        /// lookup and the recorded value.
        /// </summary>
        /// <param name="words">Words of the sentence, in order.</param>
        /// <param name="patternWordsType">Key into <c>_wordsTypes</c>; must be non-empty.</param>
        /// <returns>The words joined by single spaces with matches replaced, trimmed.</returns>
        private string CutPatternWords(string[] words, string patternWordsType)
        {
            string patternSign = String.Concat(">", patternWordsType.Substring(0, 1).ToLower());
            bool   stemFirst   = patternSign == ">v"; // if verb stemming it

            List <string> pieces = new List <string>(words.Length);

            _cutWords.Add(patternSign, new LinkedList <string>());
            foreach (string word in words)
            {
                // Stem at most once per word; previously a matching verb was stemmed twice.
                string candidate = stemFirst ? stemmer.Stem(word).Value : word;

                if (_wordsTypes[patternWordsType].Contains(candidate))
                {
                    pieces.Add(patternSign);
                    _cutWords[patternSign].AddLast(candidate);
                }
                else
                {
                    pieces.Add(word);
                }
            }

            // Join + Trim yields the same text as the former "piece + ' '" concatenation + Trim.
            return(string.Join(" ", pieces).Trim());
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Looks up every stemmed search token in the inverted index and stores the merged
        /// result list in <c>ViewData["List"]</c>: entries common to all found posting lists
        /// come first, followed by the remaining entries in posting-list order, without
        /// duplicates being appended.
        /// </summary>
        /// <param name="viewModel">View model carrying the space-separated search string.</param>
        /// <returns>The default view.</returns>
        public IActionResult Search(SearchViewModel viewModel)
        {
            List <float> commonList = new List <float>();

            // A missing search string used to crash on Split; treat it as "no results".
            if (viewModel?.search == null)
            {
                ViewData["List"] = commonList;
                return(View());
            }

            // Stem each token and collect the posting list of every token found in the index.
            List <List <float> > links = new List <List <float> >();
            foreach (string rawToken in viewModel.search.Split(" "))
            {
                string       stemmedToken = ps.Stem(rawToken).Value;
                List <float> postings;
                try
                {
                    postings = invIndex.jsonData[stemmedToken].ToObject <List <float> >();
                }
                catch
                {
                    // Deliberate best effort kept from the original: a token whose lookup
                    // fails (presumably because it is not indexed) is simply skipped.
                    continue;
                }
                if (postings != null) // a null list would crash the merge below
                {
                    links.Add(postings);
                }
            }

            // Entries shared by all posting lists go first.
            if (links.Count > 0)
            {
                commonList.AddRange(links[0]);
                for (int k = 1; k < links.Count; k++)
                {
                    commonList = commonList.Intersect <float>(links[k]).ToList();
                }
            }

            // Then append everything else once; the set replaces repeated List.Contains scans.
            HashSet <float> seen = new HashSet <float>(commonList);
            foreach (List <float> postings in links)
            {
                foreach (float entry in postings)
                {
                    if (seen.Add(entry))
                    {
                        commonList.Add(entry);
                    }
                }
            }
            ViewData["List"] = commonList;
            return(View());
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Data-driven check: column 0 of the current data row is stemmed and compared with
        /// the expected stem in column 1.
        /// </summary>
        public void Stem_WithBatchData_StemsAllWordsCorrectly()
        {
            // Arrange
            var dataRow  = TestContext.DataRow;
            var sut      = new EnglishPorter2Stemmer();
            var input    = dataRow[0].ToString();
            var expected = dataRow[1].ToString();

            // Act
            var actual = sut.Stem(input).Value;

            // Assert
            Assert.AreEqual(expected, actual);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Stems every space-separated word of <paramref name="webcontent"/>.
        /// </summary>
        /// <param name="webcontent">Text to stem; split on single spaces.</param>
        /// <returns>
        /// The stemmed words, each preceded by one space. The result therefore starts with a
        /// space, which is kept for compatibility with existing callers.
        /// </returns>
        public string stemming(string webcontent)
        {
            //the code for stemming was already implemented online. I just added the project in my solution.
            EnglishPorter2Stemmer stem = new EnglishPorter2Stemmer();

            string[] words = webcontent.Split(' ');

            // A builder avoids re-copying the whole result for every word.
            System.Text.StringBuilder stemmedwords = new System.Text.StringBuilder(webcontent.Length + 1);

            foreach (var word in words)
            {
                stemmedwords.Append(' ').Append(stem.Stem(word).Value);
            }

            return(stemmedwords.ToString());
        }
        /// <summary>
        /// Runs every case supplied by <c>StemBatchTestCaseSource.GetTestCaseData()</c> and
        /// asserts that each unstemmed word reduces to its expected stem.
        /// </summary>
        public void Stem_WithBatchData_StemsAllWordsCorrectly()
        {
            foreach (var testCase in StemBatchTestCaseSource.GetTestCaseData())
            {
                // Arrange: a fresh stemmer per case
                var sut      = new EnglishPorter2Stemmer();
                var input    = testCase.Unstemmed;
                var expected = testCase.Expected;

                // Act
                var actual = sut.Stem(input).Value;

                // Assert
                Assert.AreEqual(expected, actual);
            }
        }
        /// <summary>
        /// Builds one <c>TFIDFNote</c> per word of the document at <c>_documentIndex</c> and
        /// appends it to <c>DocumentDictionary</c>. Each note holds the TF-IDF value of the
        /// stemmed word against every document in <c>_documents</c>.
        /// </summary>
        public void Handle()
        {
            // One stemmer for the whole run instead of one per word.
            // NOTE(review): assumes the stemmer keeps no per-call state - confirm.
            EnglishPorter2Stemmer englishPorter = new EnglishPorter2Stemmer();

            for (int k = 0; k < _documents[_documentIndex].Count; k++)
            {
                var word = _documents[_documentIndex][k];

                TFIDFNote tfidfNote = new TFIDFNote();
                tfidfNote.Word = word;

                // With word stemming: the stem does not depend on the document being scored,
                // so compute it once per word rather than once per (word, document) pair.
                var stemmedWord = englishPorter.Stem(word);

                // Fill the values list for each document:
                tfidfNote.ValuesList = new List <TFIDFValue>();
                for (int j = 0; j < _documents.Count; j++)
                {
                    tfidfNote.ValuesList.Add(
                        new TFIDFValue
                    {
                        DocumentName = _filePathList[j],
                        Value        = GetTFIDFValue(stemmedWord, _documents[j], _documents)
                    });
                }

                DocumentDictionary.Add(tfidfNote);
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Stems every element of <paramref name="input"/> and drops results that are null,
        /// empty or whitespace.
        /// </summary>
        /// <param name="input">Words to stem; may be null.</param>
        /// <returns>
        /// A completed task holding the surviving stems, or holding null when
        /// <paramref name="input"/> is null.
        /// </returns>
        public Task <string[]> Apply(string[] input)
        {
            EnglishPorter2Stemmer stemmer = new EnglishPorter2Stemmer();

            if (input == null)
            {
                return(Task.FromResult <string[]>(null));
            }

            var stemmed  = input.Select(s => stemmer.Stem(s));
            var nonBlank = stemmed.Where(w => !string.IsNullOrWhiteSpace(w));

            return(Task.FromResult(nonBlank.ToArray()));
        }
Ejemplo n.º 12
0
 /// <summary>
 /// Returns the stem of <paramref name="word"/> as produced by the wrapped <c>stemmer</c>.
 /// </summary>
 /// <param name="word">Word to stem.</param>
 /// <returns>The <c>Value</c> of the stemming result.</returns>
 public override string Stem(string word)
 {
     var stemmed = stemmer.Stem(word);

     return(stemmed.Value);
 }
Ejemplo n.º 13
0
 /// <summary>
 /// Returns the stem of <paramref name="word"/> using the shared <c>_stemmer</c> instance.
 /// </summary>
 /// <param name="word">Word to stem.</param>
 /// <returns>The <c>Value</c> of the stemming result.</returns>
 public static string Stem(string word)
 {
     var stemmed = _stemmer.Stem(word);

     return(stemmed.Value);
 }
Ejemplo n.º 14
0
        /// <summary>
        /// Normalizes query tokens in place.
        /// For every algorithm except <c>Algorithm.TFIDFSearchModel</c>, a connector is
        /// inserted wherever a term or a closing parenthesis is followed by a token that is
        /// not already a connector, and every term (anything but "(", ")", "NOT", "AND", "OR")
        /// made only of letters/digits is lower-cased and stemmed.
        /// For <c>Algorithm.TFIDFSearchModel</c> no connectors are inserted and every token
        /// made only of letters/digits is lower-cased and stemmed.
        /// </summary>
        /// <param name="tokens">Token list, modified in place.</param>
        /// <param name="INSERT_AND">Insert "AND" when true, "OR" otherwise.</param>
        public void optimizeQueryTokens(ref List <string> tokens, bool INSERT_AND = false)
        {
            var    ep2s        = new EnglishPorter2Stemmer();
            bool   booleanMode = _Algorithm != Algorithm.TFIDFSearchModel;
            string connector   = INSERT_AND ? "AND" : "OR";

            for (int i = 0; i < tokens.Count; i++)
            {
                string current = tokens[i];

                // In TF-IDF mode every token counts as a term; otherwise parentheses and
                // operators are excluded.
                bool isTerm = !booleanMode ||
                              (current != "(" && current != ")" && current != "NOT" && current != "AND" && current != "OR");

                //Insert Missinng ORs
                if (booleanMode && (isTerm || current == ")") && i + 1 < tokens.Count)
                {
                    string next = tokens[i + 1];

                    // No connector before ")", "AND" or "OR"; after a term, none before "(" either.
                    bool connectorNotNeeded = next == ")" || next == "AND" || next == "OR" ||
                                              (isTerm && next == "(");
                    if (!connectorNotNeeded)
                    {
                        tokens.Insert(i + 1, connector);
                    }
                }

                //Stem Tokens using porter2Stemmer
                if (isTerm && current.All(ch => char.IsLetterOrDigit(ch))) //if english
                {
                    tokens[i] = ep2s.Stem(current.ToLower()).Value;        //stem then add
                }
            }
        }
        /// <summary>
        /// Data-driven check: the word in column 0 of the current data row must stem to the
        /// value in column 1.
        /// </summary>
        public void Stem_WithBatchData_StemsAllWordsCorrectly()
        {
            // Arrange
            var currentRow = TestContext.DataRow;
            var porter     = new EnglishPorter2Stemmer();
            var word       = currentRow[0].ToString();
            var wanted     = currentRow[1].ToString();

            // Act
            var produced = porter.Stem(word).Value;

            // Assert
            Assert.AreEqual(wanted, produced);
        }
        /// <summary>
        /// Click handler for the search button. Clears the output controls, splits the query
        /// text on spaces into lower-cased terms and their stems, optionally produces
        /// bigram-based spelling suggestions, and then fills the result lists either from a
        /// Soundex lookup (when the Soundex box is checked) or from the stemmed positional
        /// index (exact search when the query starts with a double quote, otherwise
        /// <c>SOLVE_RANDOM</c>).
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        protected void Button1_Click(object sender, EventArgs e)
        {
            String query = Query.Text;

            // Reset every output control before doing anything else.
            Mean.Text = "";
            Results.Items.Clear();
            NonResults.Items.Clear();
            Suggested.Items.Clear();
            Proximity.Items.Clear();
            if (query.Length == 0)
            {
                return;
            }
            queryTerms        = new List <String>();
            StemmedQueryTerms = new List <String>();
            TE = new IRTest_Entities();

            bool Spelling    = Spell.Checked;
            bool Soundex     = Sound.Checked;
            bool ExactSearch = true;

            // Exact search is decided solely by a leading double quote.
            if (query[0] != '"')
            {
                ExactSearch = false;
            }

            //Retrieving Query Terms
            int    Begin = 0;
            int    End   = query.Length;
            String Term  = "";

            if (ExactSearch == true)
            {
                // Skip the first and the last character.
                // NOTE(review): the last character is assumed to be the closing quote; this is
                // never checked, so an unterminated quote drops a real character - confirm.
                Begin++;
                End--;
            }
            for (int i = Begin; i < End; i++)
            {
                if (query[i] == ' ')
                {
                    // A space ends the current term (empty terms are ignored).
                    if (Term.Length > 0)
                    {
                        String TTerm = "";
                        for (int j = 0; j < Term.Length; j++)
                        {
                            TTerm += Char.ToLower(Term[j]);
                        }
                        queryTerms.Add(TTerm);
                        EnglishPorter2Stemmer stemmer = new EnglishPorter2Stemmer();
                        String STTERM = stemmer.Stem(TTerm).Value;
                        StemmedQueryTerms.Add(STTERM);
                    }
                    Term = "";
                }
                else
                {
                    Term += query[i];
                    // Last character of the scanned range: flush the final term.
                    if (i == End - 1)
                    {
                        String TTerm = "";
                        for (int j = 0; j < Term.Length; j++)
                        {
                            TTerm += Char.ToLower(Term[j]);
                        }
                        queryTerms.Add(TTerm);
                        EnglishPorter2Stemmer stemmer = new EnglishPorter2Stemmer();
                        String STTERM = stemmer.Stem(TTerm).Value;
                        StemmedQueryTerms.Add(STTERM);
                    }
                }
            }

            //Spelling Correction
            if (Spelling == true)
            {
                String CorrectedWords = "";
                for (int i = 0; i < queryTerms.Count; i++)
                {
                    List <String> SimilarWords = new List <String>();
                    // Pad the term with '$' on both sides before cutting it into 2-char grams.
                    String        Tquery       = "$";
                    Tquery += queryTerms[i];
                    Tquery += "$";
                    for (int j = 0; j < Tquery.Length - 1; j++)
                    {
                        String        Gram           = Tquery.Substring(j, 2);
                        List <String> CandidateTerms = new List <String>();
                        // Find the stored bigram equal to this gram and split its space-separated
                        // term list; each candidate is padded with '$' like the query term.
                        foreach (Bigram BG in TE.Bigrams)
                        {
                            if (Gram.Equals(BG.gram))
                            {
                                String DictionaryTerms = BG.terms;
                                String Temp            = "";
                                for (int u = 0; u < DictionaryTerms.Length; u++)
                                {
                                    if (DictionaryTerms[u] == ' ')
                                    {
                                        if (Temp.Length > 0)
                                        {
                                            String TTemp = "$";
                                            TTemp += Temp;
                                            TTemp += "$";
                                            CandidateTerms.Add(TTemp);
                                        }
                                        Temp = "";
                                    }
                                    else
                                    {
                                        Temp += DictionaryTerms[u];
                                        if (u == DictionaryTerms.Length - 1)
                                        {
                                            String TTemp = "$";
                                            TTemp += Temp;
                                            TTemp += "$";
                                            CandidateTerms.Add(TTemp);
                                        }
                                    }
                                }
                                break;
                            }
                        }
                        // Score each candidate by the share of 2-char grams it has in common
                        // with the padded query term.
                        for (int u = 0; u < CandidateTerms.Count; u++)
                        {
                            String Candidate   = CandidateTerms[u];
                            double CommonGrams = 0;
                            double QueryGrams  = Tquery.Length - 1;
                            double TermGrams   = Candidate.Length - 1;
                            for (int f = 0; f < Tquery.Length - 1; f++)
                            {
                                for (int ff = 0; ff < Candidate.Length - 1; ff++)
                                {
                                    if (Tquery.Substring(f, 2).Equals(Candidate.Substring(ff, 2)))
                                    {
                                        // Count each query gram at most once.
                                        CommonGrams++;
                                        break;
                                    }
                                }
                            }
                            // NOTE(review): despite the variable name, 2*common/(q+t) has the form
                            // of the Dice coefficient rather than the Jaccard index - confirm intent.
                            double Jaccard = (2.0 * CommonGrams) / (QueryGrams + TermGrams);
                            Jaccard *= 100.0;
                            // Keep candidates scoring at least 45 (percent), without duplicates.
                            if (Jaccard >= 45)
                            {
                                if (!(SimilarWords.Contains(Candidate)))
                                {
                                    SimilarWords.Add(Candidate);
                                }
                            }
                        }
                    }
                    // Edit distance between the query term and each candidate with its '$'
                    // padding stripped.
                    List <int> Distances = new List <int>();
                    for (int f = 0; f < SimilarWords.Count; f++)
                    {
                        String Temp = "";
                        for (int ff = 1; ff < SimilarWords[f].Length - 1; ff++)
                        {
                            Temp += SimilarWords[f][ff];
                        }
                        int _EditDistance = EditDistance(queryTerms[i], Temp);
                        Distances.Add(_EditDistance);
                    }
                    //Sorting
                    // Bubble sort of SimilarWords by ascending distance (both lists swapped together).
                    for (int write = 0; write < SimilarWords.Count; write++)
                    {
                        for (int sort = 0; sort < SimilarWords.Count - 1; sort++)
                        {
                            if (Distances[sort] > Distances[sort + 1])
                            {
                                int temp = Distances[sort + 1];
                                Distances[sort + 1] = Distances[sort];
                                Distances[sort]     = temp;

                                String TSTR = SimilarWords[sort + 1];
                                SimilarWords[sort + 1] = SimilarWords[sort];
                                SimilarWords[sort]     = TSTR;
                            }
                        }
                    }
                    // The closest candidate becomes the correction; the literal "NULL" marks a
                    // term without any candidate.
                    if (SimilarWords.Count > 0)
                    {
                        CorrectedWords += SimilarWords[0].Substring(1, SimilarWords[0].Length - 2);
                    }
                    else
                    {
                        CorrectedWords += "NULL";
                    }
                    if (i < queryTerms.Count - 1)
                    {
                        CorrectedWords += ' ';
                    }
                    // All candidates (padding stripped) are listed as suggestions.
                    for (int f = 0; f < SimilarWords.Count; f++)
                    {
                        Suggested.Items.Add(SimilarWords[f].Substring(1, SimilarWords[f].Length - 2));
                    }
                }
                Mean.Text = CorrectedWords;
            }

            //Soundex
            if (Soundex == true)
            {
                List <int> DOCS  = new List <int>();
                List <int> FREQS = new List <int>();
                // Only the first query term is used for the Soundex lookup.
                // NOTE(review): queryTerms can be empty here (e.g. a query made only of
                // spaces), in which case this index access throws - confirm/guard.
                String     Code  = ComputeSoundex(queryTerms[0]);
                foreach (SoundCode SC in TE.SoundCodes)
                {
                    if (Code.Equals(SC.code))
                    {
                        // Split the space-separated terms stored for this code; each term is
                        // stemmed and every index row with that stem contributes a (doc, frequency) pair.
                        String PTerms = SC.terms;
                        String Temp2  = "";
                        for (int i = 0; i < PTerms.Length; i++)
                        {
                            if (PTerms[i] == ' ')
                            {
                                if (Temp2.Length > 0)
                                {
                                    EnglishPorter2Stemmer stemmer = new EnglishPorter2Stemmer();
                                    String STEMMED = stemmer.Stem(Temp2).Value;
                                    foreach (II_Stemming II in TE.II_Stemming)
                                    {
                                        if (STEMMED == II.name)
                                        {
                                            int _docid     = (int)II.docid;
                                            int _frequency = (int)II.frequency;
                                            DOCS.Add(_docid);
                                            FREQS.Add(_frequency);
                                        }
                                    }
                                }
                                Temp2 = "";
                            }
                            else
                            {
                                Temp2 += PTerms[i];
                                // Last character: flush the final term the same way.
                                if (i == PTerms.Length - 1)
                                {
                                    EnglishPorter2Stemmer stemmer = new EnglishPorter2Stemmer();
                                    String STEMMED = stemmer.Stem(Temp2).Value;
                                    foreach (II_Stemming II in TE.II_Stemming)
                                    {
                                        if (STEMMED == II.name)
                                        {
                                            int _docid     = (int)II.docid;
                                            int _frequency = (int)II.frequency;
                                            DOCS.Add(_docid);
                                            FREQS.Add(_frequency);
                                        }
                                    }
                                }
                            }
                        }
                        break;
                    }
                }
                //Sorting
                // Bubble sort by descending frequency; DOCS is kept in step with FREQS.
                for (int write = 0; write < FREQS.Count; write++)
                {
                    for (int sort = 0; sort < FREQS.Count - 1; sort++)
                    {
                        if (FREQS[sort] < FREQS[sort + 1])
                        {
                            int temp = FREQS[sort + 1];
                            FREQS[sort + 1] = FREQS[sort];
                            FREQS[sort]     = temp;

                            int TINT = DOCS[sort + 1];
                            DOCS[sort + 1] = DOCS[sort];
                            DOCS[sort]     = TINT;
                        }
                    }
                }
                List <String> URLS = new List <String>();
                foreach (EnglishPage EP in TE.EnglishPages.ToList())
                {
                    URLS.Add(EP.URL);
                }
                for (int i = 0; i < DOCS.Count; i++)
                {
                    // NOTE(review): assumes document ids are 1-based and match the position of
                    // the page in TE.EnglishPages - confirm.
                    Results.Items.Add(URLS[DOCS[i] - 1]);
                    Proximity.Items.Add(FREQS[i].ToString());
                }
                //for (int i = 0; i < SimilarWords.Count; i++) Results.Items.Add(SimilarWords[i]);
            }
            else if (queryTerms.Count > 0)
            {
                // Build TERMS: stemmed query term -> (document id -> list of positions).
                TERMS   = new Dictionary <String, Dictionary <int, List <int> > >();
                DBNames = new List <String>();
                List <String> URLS = new List <String>();
                foreach (EnglishPage EP in TE.EnglishPages.ToList())
                {
                    URLS.Add(EP.URL);
                }
                foreach (II_Stemming II in TE.II_Stemming.ToList())
                {
                    DBNames.Add(II.name);
                    if (StemmedQueryTerms.Contains(II.name) == true)
                    {
                        if (TERMS.ContainsKey(II.name) == true)
                        {
                            if (TERMS[II.name].ContainsKey((int)II.docid) == false)
                            {
                                // Parse the space-separated decimal positions.
                                // NOTE(review): parsing starts at index 1, so the first character
                                // of II.positions is skipped - presumably a leading separator; confirm.
                                String     POS  = II.positions;
                                int        C    = 0;
                                List <int> POSS = new List <int>();
                                for (int i = 1; i < POS.Length; i++)
                                {
                                    if (POS[i] == ' ')
                                    {
                                        POSS.Add(C);
                                        C = 0;
                                    }
                                    else
                                    {
                                        C = (C * 10) + ((int)(POS[i] - '0'));
                                        if (i == POS.Length - 1)
                                        {
                                            POSS.Add(C);
                                        }
                                    }
                                }
                                TERMS[II.name].Add((int)II.docid, POSS);
                            }
                        }
                        else
                        {
                            // First row seen for this term: create its map, then parse the
                            // positions exactly as in the branch above.
                            TERMS.Add(II.name, new Dictionary <int, List <int> >());
                            String     POS  = II.positions;
                            int        C    = 0;
                            List <int> POSS = new List <int>();
                            for (int i = 1; i < POS.Length; i++)
                            {
                                if (POS[i] == ' ')
                                {
                                    POSS.Add(C);
                                    C = 0;
                                }
                                else
                                {
                                    C = (C * 10) + ((int)(POS[i] - '0'));
                                    if (i == POS.Length - 1)
                                    {
                                        POSS.Add(C);
                                    }
                                }
                            }
                            TERMS[II.name].Add((int)II.docid, POSS);
                        }
                    }
                }
                if (ExactSearch == true)
                {
                    DOCS_EXACT = new Dictionary <int, int>();
                    // Number of distinct query terms found per document id.
                    // NOTE(review): the array size (1610) and the loop bound below (1600) are
                    // hard-coded - presumably the corpus size; a larger doc id would overflow. Confirm.
                    int[]      DOCS_COUNT         = new int[1610];
                    List <int> NON_CANDIDATE_DOCS = new List <int>();
                    foreach (KeyValuePair <String, Dictionary <int, List <int> > > EE in TERMS)
                    {
                        foreach (KeyValuePair <int, List <int> > EE2 in EE.Value)
                        {
                            DOCS_COUNT[EE2.Key]++;
                        }
                    }
                    // Documents containing some but not all of the query terms.
                    for (int i = 1; i <= 1600; i++)
                    {
                        if (DOCS_COUNT[i] < StemmedQueryTerms.Count && DOCS_COUNT[i] > 0)
                        {
                            NON_CANDIDATE_DOCS.Add(i);
                        }
                    }
                    // The return value is not used; DOCS_EXACT is read right after the call, so
                    // SOLVE presumably fills it - verify against SOLVE.
                    bool       TR     = SOLVE(0, queryTerms.Count, 0, 0);
                    List <int> DOCS_  = new List <int>();
                    List <int> Freqs_ = new List <int>();
                    foreach (KeyValuePair <int, int> EE in DOCS_EXACT)
                    {
                        DOCS_.Add(EE.Key);
                        Freqs_.Add(EE.Value);
                    }
                    //sorting
                    // Bubble sort by descending value; DOCS_ is kept in step with Freqs_.
                    for (int write = 0; write < Freqs_.Count; write++)
                    {
                        for (int sort = 0; sort < Freqs_.Count - 1; sort++)
                        {
                            if (Freqs_[sort] < Freqs_[sort + 1])
                            {
                                int temp = Freqs_[sort + 1];
                                Freqs_[sort + 1] = Freqs_[sort];
                                Freqs_[sort]     = temp;

                                int TINT = DOCS_[sort + 1];
                                DOCS_[sort + 1] = DOCS_[sort];
                                DOCS_[sort]     = TINT;
                            }
                        }
                    }
                    for (int i = 0; i < DOCS_.Count; i++)
                    {
                        Results.Items.Add(URLS[DOCS_[i] - 1]);
                        Proximity.Items.Add(Freqs_[i].ToString());
                    }
                    //NON COMMON RESULTS
                    for (int i = 0; i < NON_CANDIDATE_DOCS.Count; i++)
                    {
                        NonResults.Items.Add(URLS[NON_CANDIDATE_DOCS[i] - 1]);
                    }
                }
                else
                {
                    // Non-exact search: the three lists are reset here and read after
                    // SOLVE_RANDOM, which presumably fills them - verify against SOLVE_RANDOM.
                    DOCS_RANDOM   = new List <int>();
                    DOCS_DIS      = new List <int>();
                    NON_CANDIDATE = new List <int>();
                    SOLVE_RANDOM(queryTerms.Count);
                    for (int i = 0; i < DOCS_RANDOM.Count; i++)
                    {
                        Results.Items.Add(URLS[DOCS_RANDOM[i] - 1]);
                        Proximity.Items.Add(DOCS_DIS[i].ToString());
                    }
                    //NON COMMON RESULTS
                    for (int i = 0; i < NON_CANDIDATE.Count; i++)
                    {
                        NonResults.Items.Add(URLS[NON_CANDIDATE[i] - 1]);
                    }
                }
            }
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Console entry point. Loads every <c>*.sgm</c> file from a hard-coded folder, keeps the
        /// REUTERS records that are tagged with exactly one of six places and have a body, turns
        /// each body into a vector of stemmed-term weights and prints the accuracy of a
        /// k-nearest-neighbours classifier run on those vectors.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Only records labelled with exactly one of these places are used.
            List<string> properPlaces = new List<string> {
                "usa", "west-germany", "france", "uk", "japan", "canada"
            };
            List<string>  myPlaces = new List<string>();  // label of every kept record
            List<string>  myBody   = new List<string>();  // body text of every kept record (parallel to myPlaces)
            List<TestObj> Testowe  = new List<TestObj>(); // labelled feature vectors

            DirectoryInfo d     = new DirectoryInfo(@"E:\Pulpit\1111111");
            FileInfo[]    Files = d.GetFiles("*.sgm");

            // The serializer does not depend on the file, so it is created once.
            XmlSerializer Serializer = new XmlSerializer(typeof(root));

            foreach (FileInfo file in Files)
            {
                const Int32 BufferSize = 128;

                // A StringBuilder avoids the quadratic cost of appending every line to a string.
                StringBuilder wynik = new StringBuilder("<root>");

                using (var fileStream = File.OpenRead(file.FullName))
                using (var streamReader = new StreamReader(fileStream, Encoding.UTF8, true, BufferSize))
                {
                    // The first line of each file is read and discarded; it never reaches the XML text.
                    streamReader.ReadLine();

                    String line;
                    while ((line = streamReader.ReadLine()) != null)
                    {
                        // Strip the characters that conflict with XML parsing.
                        // NOTE(review): lines are joined without any separator, so a word ending one
                        // line is glued to the word starting the next - confirm this is intended.
                        wynik.Append(ReplaceHexadecimalSymbols(line));
                    }
                }

                // Wrap the whole file in a single root element so it can be deserialized.
                wynik.Append("</root>");

                root result;
                using (TextReader reader = new StringReader(wynik.ToString()))
                {
                    result = (root)Serializer.Deserialize(reader);
                }

                foreach (rootREUTERS rootREUTER in result.REUTERS)
                {
                    // Keep only records with exactly one place, and only if that place is known.
                    if (rootREUTER.PLACES.Length != 1 || !properPlaces.Contains(rootREUTER.PLACES[0]))
                    {
                        continue;
                    }

                    // Records without a body carry no text to classify.
                    if (rootREUTER.TEXT.BODY is null)
                    {
                        continue;
                    }

                    myPlaces.Add(rootREUTER.PLACES[0]);
                    myBody.Add(rootREUTER.TEXT.BODY);
                }

                Console.WriteLine("Processing " + file.FullName + "...");
            }

            EnglishPorter2Stemmer stemmer = new EnglishPorter2Stemmer();

            // Corpus-wide number of occurrences of every stemmed token.
            Dictionary<string, int> wordsDictionary = new Dictionary<string, int>();
            char[] separator = { '.', ',', ' ', '\t', '"', '=', '-', '<', '>', ')', '(', ';' };

            foreach (string body in myBody)
            {
                foreach (string world in body.Split(separator))
                {
                    string pom = stemmer.Stem(world).Value;

                    int seen;
                    wordsDictionary.TryGetValue(pom, out seen); // seen stays 0 for a new token
                    wordsDictionary[pom] = seen + 1;
                }
            }

            // The vocabulary (one feature per entry) is every token seen more than 10 times.
            List<string> allwords = new List<string>();
            foreach (KeyValuePair<string, int> kvp in wordsDictionary)
            {
                if (kvp.Value > 10)
                {
                    allwords.Add(kvp.Key);
                }
            }

            // Build one feature vector per record.
            // NOTE(review): documents are split on ' ' only here, while the corpus counts above use
            // the wider separator set - confirm the mismatch is intended.
            for (int i = 0; i < myBody.Count; i++)
            {
                // Count every stemmed token of this document once instead of rescanning the
                // token array for each vocabulary word.
                Dictionary<string, int> termCounts = new Dictionary<string, int>();
                foreach (string world in myBody[i].Split(' '))
                {
                    string pom = stemmer.Stem(world).Value;

                    int seen;
                    termCounts.TryGetValue(pom, out seen);
                    termCounts[pom] = seen + 1;
                }

                List<double> feature1 = new List<double>(allwords.Count);
                foreach (string world in allwords)
                {
                    int occurrences;
                    if (termCounts.TryGetValue(world, out occurrences))
                    {
                        // tf-idf-like weight: occurrences in this document times
                        // ln(number of documents / occurrences of the token in the whole corpus).
                        double pomocy = (double)myBody.Count / (double)wordsDictionary[world];
                        feature1.Add(occurrences * Math.Log(pomocy));
                    }
                    else
                    {
                        feature1.Add(0);
                    }
                }

                Testowe.Add(new TestObj(myPlaces[i], feature1));
            }

            // Run the classifier and wait for a key press before the console closes.
            KNN(10, Testowe, 5);

            Console.ReadKey();

            // Classifies part of objs with k-nearest-neighbours (Euclidean distance) against the
            // rest, prints the set sizes and the accuracy, and returns the accuracy.
            //   k       - number of neighbours that vote on each prediction
            //   objs    - labelled feature vectors
            //   odsetek - how many objects out of every cycle of 10 kept objects go to the reference set
            double KNN(int k, List<TestObj> objs, int odsetek)
            {
                List<TestObj> Tests        = new List<TestObj>(); // objects that get classified
                List<TestObj> Verification = new List<TestObj>(); // reference objects searched for neighbours

                int DeleteUsa = 0; // "usa" objects seen since the last kept object
                int cykle     = 0; // number of classified objects
                int poprawne  = 0; // number of correct predictions
                int licz      = 1; // position inside the current cycle of 10 kept objects

                foreach (TestObj obj in objs)
                {
                    bool isUsa = obj.label.Equals("usa");
                    if (isUsa)
                    {
                        DeleteUsa++;
                    }

                    // A "usa" object is kept only when it brings the counter to 10; every other
                    // object is always kept. The counter restarts whenever any object is kept.
                    if ((DeleteUsa == 10) || !isUsa)
                    {
                        if (licz <= odsetek)
                        {
                            Verification.Add(obj);
                            licz++;
                        }
                        else if (licz < 10)
                        {
                            Tests.Add(obj);
                            licz++;
                        }
                        else
                        {
                            Tests.Add(obj);
                            licz = 1;
                        }
                        DeleteUsa = 0;
                    }
                }
                Console.WriteLine(Tests.Count + " | " + Verification.Count);

                foreach (TestObj oTe in Tests)
                {
                    // Distances and labels of the nearest reference objects found so far (parallel lists).
                    List<double> Neighbours = new List<double>();
                    List<string> Nlabels    = new List<string>();

                    foreach (TestObj oVe in Verification)
                    {
                        double distance = 0;
                        for (int i = 0; i < oVe.features.Count; i++)
                        {
                            var diff = oTe.features[i] - oVe.features[i];
                            distance += diff * diff;
                        }
                        double sqdistance = Math.Sqrt(distance);

                        if (Neighbours.Count < k)
                        {
                            // Still filling the initial k slots.
                            Neighbours.Add(sqdistance);
                            Nlabels.Add(oVe.label);
                        }
                        else if (Neighbours.Count > 0)
                        {
                            // Replace only the farthest neighbour, and only when the new object is
                            // closer. Overwriting every farther slot would fill the list with
                            // copies of the same object.
                            int farthest = 0;
                            for (int n = 1; n < Neighbours.Count; n++)
                            {
                                if (Neighbours[n] > Neighbours[farthest])
                                {
                                    farthest = n;
                                }
                            }

                            if (Neighbours[farthest] > sqdistance)
                            {
                                Neighbours[farthest] = sqdistance;
                                Nlabels[farthest]    = oVe.label;
                            }
                        }
                    }

                    // Majority vote. Ties go to the place listed first in properPlaces.
                    string prediction = null;
                    int    max        = 0;
                    foreach (string place in properPlaces)
                    {
                        int votes = 0;
                        foreach (string label in Nlabels)
                        {
                            if (place.Equals(label))
                            {
                                votes++;
                            }
                        }

                        // The best count must be updated together with the winner; otherwise a
                        // later place with fewer votes than the leader could still take over.
                        if (votes > max)
                        {
                            max        = votes;
                            prediction = place;
                        }
                    }

                    // prediction stays null when there were no neighbours to vote.
                    if (prediction != null && oTe.label.Equals(prediction))
                    {
                        poprawne++;
                    }

                    cykle++;
                }

                // Report 0 instead of NaN when nothing was classified.
                double accuracy = cykle > 0 ? (double)poprawne / cykle : 0;
                Console.WriteLine("Wynik : " + accuracy);
                return(accuracy);
            }

            // Removes the control characters U+0000-U+0008, U+000B, U+000C, U+000E-U+001F and
            // the ampersand (U+0026) from txt.
            string ReplaceHexadecimalSymbols(string txt)
            {
                string r = "[\x00-\x08\x0B\x0C\x0E-\x1F\x26]";

                return(Regex.Replace(txt, r, "", RegexOptions.Compiled));
            }
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Parses one raw tweet line into a <c>Tweet</c>. The first 29 characters are converted to
        /// the tweet date, the character after them is skipped, and the remainder is split on tabs
        /// into the user name and the tweet text. The text is cleaned, tokenized, filtered and
        /// optionally stemmed, and the kept words are recorded in the bag of words and in the
        /// word/user lists.
        /// </summary>
        /// <param name="tweet">Raw tweet line.</param>
        /// <param name="stemmingFlag">When true, every kept word is replaced by its stem.</param>
        /// <param name="filter">Active filter; only <c>Filter.WORDS</c> has an effect here.</param>
        /// <param name="minWordLength">Words shorter than this are dropped and counted as filtered out.</param>
        /// <returns>
        /// The parsed tweet, or null when the tweet is null, contains a URL, is too short to hold
        /// the date prefix, lacks a user name or text field, or has no word left after filtering.
        /// </returns>
        public Tweet CleanTweet(string tweet, bool stemmingFlag, Filter filter, int minWordLength)
        {
            const int dateLength   = 29; // length of the leading date string
            const int contentStart = 30; // index where "user<TAB>text" begins

            // Tweets with URLs are skipped on the assumption that they are advertisements.
            if (tweet == null || ContainsURL(tweet))
            {
                return null;
            }

            // Too short to contain the date prefix: treat it like any other unparsable tweet
            // instead of letting Substring throw.
            if (tweet.Length < contentStart)
            {
                return null;
            }

            DateTime tweetDate = _Util.ConvertToDate(tweet.Substring(0, dateLength));

            // First item is the user name, second item is the tweet text.
            string[] parts = tweet.Substring(contentStart).Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length < 2)
            {
                // No user name or no text: nothing to parse.
                return null;
            }

            string userName = parts[0];

            // Replace everything matched by the character regex with a space, then split on spaces.
            string[] words = Regex.Replace(parts[1], acceptableCharactersRegex, " ").Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            List<string> tweetWords = new List<string>();

            foreach (string str in words)
            {
                string keyword = str.Trim().ToLower();

                // Skip stop words, words already collected for this tweet, and integers.
                // NOTE(review): the duplicate check runs before stemming, so two different words
                // with the same stem are both added - confirm this is intended.
                int tempNumericCheck;
                if (IsStopWord(keyword) || tweetWords.Contains(keyword) || int.TryParse(keyword, out tempNumericCheck))
                {
                    continue;
                }

                bool dropFlag = false;

                // Only the word-dropping filter is applied here; the other filter types need
                // the complete data set.
                if (filter == Filter.WORDS && _BagOfWords._RemoveWords.ContainsKey(keyword))
                {
                    _BagOfWords._RemoveWords[keyword] = Convert.ToSingle(_BagOfWords._RemoveWords[keyword]) + 1f;
                    dropFlag = true;
                }

                if (keyword.Length < minWordLength)
                {
                    // Too short: count it among the filtered-out words.
                    if (_BagOfWords._FilteredOutWords.ContainsKey(keyword))
                    {
                        _BagOfWords._FilteredOutWords[keyword] = Convert.ToSingle(_BagOfWords._FilteredOutWords[keyword]) + 1f;
                    }
                    else
                    {
                        _BagOfWords._FilteredOutWords.Add(keyword, 1f);
                    }

                    dropFlag = true;
                }

                if (dropFlag)
                {
                    continue;
                }

                if (stemmingFlag)
                {
                    keyword = stemmer.Stem(keyword).Value;
                }

                _BagOfWords.Add(keyword);
                tweetWords.Add(keyword);
            }

            // A tweet must contain at least one word.
            if (tweetWords.Count == 0)
            {
                return null;
            }

            _wordsUserList.AddWordsInfo(tweetWords, userName);
            _usersWordList.AddUserInfo(userName, tweetWords);

            return new Tweet(tweetDate, userName, tweetWords);
        }
        /// <summary>
        /// Gets the postings for this wildcard token. The token is split on '*', each literal is
        /// normalized and looked up through the k-gram index, the candidate vocabulary terms of
        /// all literals are intersected, and the stemmed survivors are queried in the index.
        /// </summary>
        /// <param name="index">Inverted index that is queried with the stemmed candidate terms.</param>
        /// <param name="processor">Token processor; it must be a <c>NormalTokenProcessor</c>, otherwise the cast throws.</param>
        /// <returns>The postings returned by <paramref name="index"/> for the stemmed candidate terms.</returns>
        public IList <Posting> GetPostings(IIndex index, ITokenProcessor processor)
        {
            processor = ((NormalTokenProcessor)processor);

            // Split the token on '*' and normalize every literal. The first literal gets a
            // leading '$' and the last one a trailing '$' to mark the word boundaries.
            string[] literals  = this.token.Split("*").ToArray();
            int      lastIndex = literals.Length - 1;
            for (int i = 0; i < literals.Length; i++)
            {
                List <string> processedToken = processor.ProcessToken(literals[i]);
                if (processedToken.Count == 0)
                {
                    // Nothing came out of the processor: the literal is left as it is.
                    continue;
                }

                string normalized = processedToken[0];
                if (i == 0)
                {
                    literals[i] = "$" + normalized;
                }
                else if (i == lastIndex)
                {
                    literals[i] = normalized + "$";
                }
                else
                {
                    literals[i] = normalized;
                }
            }
            literals = literals.Where(x => !string.IsNullOrEmpty(x) && x != "$").ToArray();

            // Gather the candidate vocabulary terms for every literal.
            List <List <string> > candidatesList = new List <List <string> >();

            foreach (string literal in literals)
            {
                List <string> candidates = new List <string>();
                bool          isFirst    = true;

                // The first k-gram seeds the candidates; every further k-gram is AND-merged.
                foreach (string kGramTerm in this.KGramSplitter(literal))
                {
                    var vocabulary = this.kGram.getVocabularies(kGramTerm);
                    candidates = isFirst
                        ? candidates.Union(vocabulary).ToList()
                        : candidates.Intersect(vocabulary).ToList();
                    isFirst = false;
                }

                // Post-filtering step: k-gram matches are verified against the literal itself.
                // Comparisons are ordinal so that StartsWith/EndsWith agree with Contains and do
                // not depend on the current culture.
                if (candidates.Count > 0)
                {
                    bool anchoredStart = literal[0] == '$';
                    bool anchoredEnd   = literal[literal.Length - 1] == '$';

                    if (anchoredStart && !anchoredEnd)
                    {
                        // $literal*
                        string prefix = literal.Substring(1);
                        candidates = candidates.Where(s => s.StartsWith(prefix, StringComparison.Ordinal)).ToList();
                    }
                    else if (!anchoredStart && anchoredEnd)
                    {
                        // *literal$
                        string suffix = literal.Substring(0, literal.Length - 1);
                        candidates = candidates.Where(s => s.EndsWith(suffix, StringComparison.Ordinal)).ToList();
                    }
                    else if (!anchoredStart && !anchoredEnd)
                    {
                        // *literal*
                        // NOTE(review): words that start or end with the literal are rejected, so
                        // "*b*" does not match "bat" - confirm that this is intended.
                        candidates = candidates.Where(s => s.Contains(literal) &&
                                                           !s.StartsWith(literal, StringComparison.Ordinal) &&
                                                           !s.EndsWith(literal, StringComparison.Ordinal)).ToList();
                    }
                }

                // An empty list is added as well; it empties the final intersection.
                candidatesList.Add(candidates);
            }

            // The final candidates are the terms that survived for every literal.
            List <string> finalCandidates = new List <string>();

            for (int i = 0; i < candidatesList.Count; i++)
            {
                finalCandidates = (i == 0)
                    ? finalCandidates.Union(candidatesList[i]).ToList()
                    : finalCandidates.Intersect(candidatesList[i]).ToList();
            }

            // Stem the final candidates; the set removes duplicate stems.
            HashSet <string> stemmedFinalCandidates = new HashSet <string>();

            foreach (string s in finalCandidates)
            {
                stemmedFinalCandidates.Add(stemmer.Stem(s).Value);
            }

            return(index.GetPostings(stemmedFinalCandidates.ToList()));
        }