// Replace each word count in the sample with a count keyed by the word's Porter2 stem,
// merging counts when several words share a stem.
static void Stem(Sample sample)
{
    var stemmer = new EnglishPorter2Stemmer();
    var stemmedWords = new Dictionary<string, int>();

    foreach (var word in sample.words)
    {
        var stemmedKey = stemmer.Stem(word.Key).Value;

        // Accumulate under the stemmed key directly, instead of the original
        // remove-then-add round trip through the dictionary.
        stemmedWords.TryGetValue(stemmedKey, out var existing);
        stemmedWords[stemmedKey] = existing + word.Value;
    }

    sample.words = stemmedWords;
}
public void Stem_WithBatchData_StemsAllWordsCorrectly(string unstemmed, string expected)
{
    var stemmer = new EnglishPorter2Stemmer();

    var stemmed = stemmer.Stem(unstemmed).Value;

    Assert.AreEqual(expected, stemmed);
}
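This parameterized form needs a test-case source to supply the word pairs. A minimal sketch of how it might be wired up, assuming NUnit; the word/stem pairs in the attributes are illustrative, not the project's actual data set:

using NUnit.Framework;
using Porter2Stemmer;

[TestFixture]
public class StemmerBatchTests
{
    // Hypothetical example pairs, not taken from the original batch data.
    [TestCase("running", "run")]
    [TestCase("cats", "cat")]
    [TestCase("consisted", "consist")]
    public void Stem_WithBatchData_StemsAllWordsCorrectly(string unstemmed, string expected)
    {
        var stemmer = new EnglishPorter2Stemmer();

        var stemmed = stemmer.Stem(unstemmed).Value;

        Assert.AreEqual(expected, stemmed);
    }
}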
public List<string> parseText(string text)
{
    var charText = text.ToCharArray();
    string token = string.Empty;
    List<string> textTokens = new List<string>();
    var ep2s = new EnglishPorter2Stemmer();

    for (int i = 0; i < charText.Length; i++)
    {
        char c = charText[i];
        if (char.IsLetterOrDigit(c) || IsArabic(c)) // accept English and Arabic characters only
        {
            token += c;
        }
        else if (!string.IsNullOrEmpty(token))
        {
            // Letter/digit-only tokens are stemmed; the others (intended for Arabic) are added as-is.
            textTokens.Add(token.All(char.IsLetterOrDigit) ? ep2s.Stem(token).Value : token);
            token = string.Empty;
        }
    }

    // Flush the final token, which has no trailing delimiter.
    if (!string.IsNullOrEmpty(token))
    {
        textTokens.Add(token.All(char.IsLetterOrDigit) ? ep2s.Stem(token).Value : token);
    }

    return textTokens;
}
public List<string> goStemmer(string[] filteredTokens)
{
    outList = new List<string>();
    foreach (string token in filteredTokens)
    {
        outWord = toStem.Stem(token);
        outList.Add(outWord.Value);
    }
    return outList;
}
private string CutPatternWords(string[] words, string patternWordsType)
{
    string output = "";
    string patternSign = String.Concat(">", patternWordsType.Substring(0, 1).ToLower());
    _cutWords.Add(patternSign, new LinkedList<string>());

    foreach (string word in words)
    {
        // Verbs (">v") are matched and stored by their stem; other word types are matched verbatim.
        string candidate = patternSign == ">v" ? stemmer.Stem(word).Value : word;
        if (_wordsTypes[patternWordsType].Contains(candidate))
        {
            output += patternSign + " ";
            _cutWords[patternSign].AddLast(candidate);
        }
        else
        {
            output += word + " ";
        }
    }

    return output.Trim();
}
public IActionResult Search(SearchViewModel viewModel)
{
    // Stem each search token with Porter2 before looking it up in the inverted index.
    var searchTokens = viewModel.search.Split(" ");
    var stemmedTokens = new List<string>();
    foreach (string s in searchTokens)
    {
        stemmedTokens.Add(ps.Stem(s).Value);
    }

    // Collect the posting list (document ids) for every token present in the index.
    var links = new List<List<float>>();
    foreach (string s in stemmedTokens)
    {
        List<float> index;
        try
        {
            index = invIndex.jsonData[s].ToObject<List<float>>();
        }
        catch
        {
            // Token not present in the index; skip it.
            continue;
        }
        links.Add(index);
    }

    // Documents containing every token come first...
    var commonList = new List<float>();
    int ctr = 0;
    foreach (List<float> l in links)
    {
        if (ctr == 0)
        {
            commonList.AddRange(l);
            ctr++;
            continue;
        }
        commonList = commonList.Intersect(l).ToList();
    }

    // ...followed by documents that match only some of the tokens.
    foreach (var i in links)
    {
        foreach (float f in i)
        {
            if (!commonList.Contains(f))
            {
                commonList.Add(f);
            }
        }
    }

    ViewData["List"] = commonList;
    return View();
}
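For intuition, a sketch (with hypothetical posting lists) of the ordering this action hands to the view:

// Suppose the index returned these posting lists for two stemmed tokens:
var links = new List<List<float>>
{
    new List<float> { 1, 2, 3 }, // documents containing "run"
    new List<float> { 2, 3, 5 }, // documents containing "cat"
};
// Intersection pass:      commonList == { 2, 3 }        (documents matching every token)
// Append-remainder pass:  commonList == { 2, 3, 1, 5 }  (partial matches come last)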
public void Stem_WithBatchData_StemsAllWordsCorrectly()
{
    // Arrange
    var stemmer = new EnglishPorter2Stemmer();
    var row = TestContext.DataRow;
    var unstemmed = row[0].ToString();
    var expected = row[1].ToString();

    // Act
    var stemmed = stemmer.Stem(unstemmed).Value;

    // Assert
    Assert.AreEqual(expected, stemmed);
}
public string stemming(string webcontent)
{
    // The stemming algorithm itself was already implemented in an existing
    // open-source project, which was added to this solution.
    var stem = new EnglishPorter2Stemmer();
    string[] words = webcontent.Split(' ');
    string stemmedwords = "";
    foreach (var word in words)
    {
        stemmedwords = stemmedwords + " " + stem.Stem(word).Value;
    }
    return stemmedwords;
}
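Taken together, the snippets above all follow the same pattern: construct an EnglishPorter2Stemmer, call Stem on a token, and read the stemmed string from .Value. A minimal self-contained sketch, assuming the Porter2Stemmer namespace from the NuGet package of the same name:

using System;
using Porter2Stemmer;

class StemDemo
{
    static void Main()
    {
        var stemmer = new EnglishPorter2Stemmer();
        // Stem(...) returns a StemmedWord; .Value holds the stemmed string.
        foreach (var word in new[] { "running", "cats", "consisted" })
        {
            Console.WriteLine($"{word} -> {stemmer.Stem(word).Value}");
        }
        // Expected output per the Porter2 rules: run, cat, consist
    }
}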
public void Stem_WithBatchData_StemsAllWordsCorrectly()
{
    var tests = StemBatchTestCaseSource.GetTestCaseData();
    foreach (var batchTestDataModel in tests)
    {
        // Arrange
        var stemmer = new EnglishPorter2Stemmer();
        var unstemmed = batchTestDataModel.Unstemmed;
        var expected = batchTestDataModel.Expected;

        // Act
        var stemmed = stemmer.Stem(unstemmed).Value;

        // Assert
        Assert.AreEqual(expected, stemmed);
    }
}
public void Handle()
{
    // One stemmer instance is enough for the whole pass.
    var englishPorter = new EnglishPorter2Stemmer();

    for (int k = 0; k < _documents[_documentIndex].Count; k++)
    {
        var tfidfNote = new TFIDFNote { Word = _documents[_documentIndex][k] };

        // Fill the values list for each document, stemming the word first.
        tfidfNote.ValuesList = new List<TFIDFValue>();
        for (int j = 0; j < _documents.Count; j++)
        {
            tfidfNote.ValuesList.Add(new TFIDFValue
            {
                DocumentName = _filePathList[j],
                Value = GetTFIDFValue(englishPorter.Stem(_documents[_documentIndex][k]), _documents[j], _documents)
            });
        }

        DocumentDictionary.Add(tfidfNote);
    }
}
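GetTFIDFValue is not shown in this snippet. As a rough sketch of what such a helper usually computes, here is a hypothetical stand-in (standard tf·idf with a natural-log idf over raw document frequency; the name and signature are assumptions, not the project's actual method):

// Requires System, System.Collections.Generic, System.Linq.
// Hypothetical stand-in for GetTFIDFValue: term frequency in one document
// scaled by log inverse document frequency across the corpus.
static double ComputeTfIdf(string term, List<string> document, List<List<string>> documents)
{
    double tf = document.Count(w => w == term);
    double df = documents.Count(d => d.Contains(term));
    return df == 0 ? 0 : tf * Math.Log(documents.Count / df);
}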
public Task<string[]> Apply(string[] input)
{
    var stemmer = new EnglishPorter2Stemmer();
    // Stem(...) returns a StemmedWord, so project to .Value before the null/whitespace filter.
    return Task.FromResult(input?.Select(s => stemmer.Stem(s).Value)
                                 .Where(w => !string.IsNullOrWhiteSpace(w))
                                 .ToArray());
}
public override string Stem(string word)
{
    return stemmer.Stem(word).Value;
}
public static string Stem(string word)
{
    return _stemmer.Stem(word).Value;
}
public void optimizeQueryTokens(ref List<string> tokens, bool INSERT_AND = false)
{
    var ep2s = new EnglishPorter2Stemmer();

    if (_Algorithm != Algorithm.TFIDFSearchModel)
    {
        for (int i = 0; i < tokens.Count; i++)
        {
            if (tokens[i] != "(" && tokens[i] != ")" && tokens[i] != "NOT" && tokens[i] != "AND" && tokens[i] != "OR")
            {
                // Insert the missing boolean operator (AND or OR) between adjacent terms.
                if (i + 1 < tokens.Count &&
                    tokens[i + 1] != "(" && tokens[i + 1] != ")" && tokens[i + 1] != "AND" && tokens[i + 1] != "OR")
                {
                    tokens.Insert(i + 1, INSERT_AND ? "AND" : "OR");
                }

                // Stem English tokens with Porter2.
                if (tokens[i].All(char.IsLetterOrDigit))
                {
                    tokens[i] = ep2s.Stem(tokens[i].ToLower()).Value;
                }
            }
            else if (tokens[i] == ")")
            {
                // A term directly after a closing parenthesis also needs an operator.
                if (i + 1 < tokens.Count &&
                    tokens[i + 1] != ")" && tokens[i + 1] != "AND" && tokens[i + 1] != "OR")
                {
                    tokens.Insert(i + 1, INSERT_AND ? "AND" : "OR");
                }
            }
        }
    }
    else // _Algorithm == Algorithm.TFIDFSearchModel: no boolean operators, just stem.
    {
        for (int i = 0; i < tokens.Count; i++)
        {
            if (tokens[i].All(char.IsLetterOrDigit))
            {
                tokens[i] = ep2s.Stem(tokens[i].ToLower()).Value;
            }
        }
    }
}
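A hypothetical walk-through (not from the original source) of what this does to a raw token list in the boolean mode:

// With the default INSERT_AND = false:
var tokens = new List<string> { "running", "(", "cats", "dogs", ")" };
optimizeQueryTokens(ref tokens);
// tokens is now: "run", "(", "cat", "OR", "dog", ")"
// - "OR" was inserted between the two adjacent terms inside the parentheses
// - each English term was lowercased and stemmed with Porter2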
protected void Button1_Click(object sender, EventArgs e)
{
    String query = Query.Text;
    Mean.Text = "";
    Results.Items.Clear();
    NonResults.Items.Clear();
    Suggested.Items.Clear();
    Proximity.Items.Clear();
    if (query.Length == 0) { return; }

    queryTerms = new List<String>();
    StemmedQueryTerms = new List<String>();
    TE = new IRTest_Entities();

    bool Spelling = Spell.Checked;
    bool Soundex = Sound.Checked;
    // A query wrapped in double quotes is treated as an exact (phrase) search.
    bool ExactSearch = query[0] == '"';

    // Retrieve the query terms: split on spaces, lowercase, and stem each term.
    int Begin = 0;
    int End = query.Length;
    String Term = "";
    if (ExactSearch) { Begin++; End--; } // skip the surrounding quotes

    for (int i = Begin; i < End; i++)
    {
        if (query[i] == ' ')
        {
            if (Term.Length > 0)
            {
                String TTerm = Term.ToLower();
                queryTerms.Add(TTerm);
                StemmedQueryTerms.Add(new EnglishPorter2Stemmer().Stem(TTerm).Value);
            }
            Term = "";
        }
        else
        {
            Term += query[i];
            if (i == End - 1) // flush the final term
            {
                String TTerm = Term.ToLower();
                queryTerms.Add(TTerm);
                StemmedQueryTerms.Add(new EnglishPorter2Stemmer().Stem(TTerm).Value);
            }
        }
    }

    // Spelling correction: rank dictionary terms by bigram overlap, then by edit distance.
    if (Spelling)
    {
        String CorrectedWords = "";
        for (int i = 0; i < queryTerms.Count; i++)
        {
            List<String> SimilarWords = new List<String>();
            String Tquery = "$" + queryTerms[i] + "$"; // '$' marks word boundaries for bigram generation

            for (int j = 0; j < Tquery.Length - 1; j++)
            {
                String Gram = Tquery.Substring(j, 2);
                List<String> CandidateTerms = new List<String>();

                // Collect every dictionary term that shares this bigram.
                foreach (Bigram BG in TE.Bigrams)
                {
                    if (Gram.Equals(BG.gram))
                    {
                        // BG.terms is a space-separated list of dictionary terms.
                        String DictionaryTerms = BG.terms;
                        String Temp = "";
                        for (int u = 0; u < DictionaryTerms.Length; u++)
                        {
                            if (DictionaryTerms[u] == ' ')
                            {
                                if (Temp.Length > 0) { CandidateTerms.Add("$" + Temp + "$"); }
                                Temp = "";
                            }
                            else
                            {
                                Temp += DictionaryTerms[u];
                                if (u == DictionaryTerms.Length - 1) { CandidateTerms.Add("$" + Temp + "$"); }
                            }
                        }
                        break;
                    }
                }

                // Keep candidates whose bigram overlap score is at least 45%.
                for (int u = 0; u < CandidateTerms.Count; u++)
                {
                    String Candidate = CandidateTerms[u];
                    double CommonGrams = 0;
                    double QueryGrams = Tquery.Length - 1;
                    double TermGrams = Candidate.Length - 1;
                    for (int f = 0; f < Tquery.Length - 1; f++)
                    {
                        for (int ff = 0; ff < Candidate.Length - 1; ff++)
                        {
                            if (Tquery.Substring(f, 2).Equals(Candidate.Substring(ff, 2))) { CommonGrams++; break; }
                        }
                    }
                    // Dice-style coefficient, despite the variable name.
                    double Jaccard = 100.0 * (2.0 * CommonGrams) / (QueryGrams + TermGrams);
                    if (Jaccard >= 45 && !SimilarWords.Contains(Candidate)) { SimilarWords.Add(Candidate); }
                }
            }

            // Rank the surviving candidates by edit distance to the query term.
            List<int> Distances = new List<int>();
            for (int f = 0; f < SimilarWords.Count; f++)
            {
                String Temp = SimilarWords[f].Substring(1, SimilarWords[f].Length - 2); // strip the '$' sentinels
                Distances.Add(EditDistance(queryTerms[i], Temp));
            }

            // Bubble sort candidates by ascending edit distance.
            for (int write = 0; write < SimilarWords.Count; write++)
            {
                for (int sort = 0; sort < SimilarWords.Count - 1; sort++)
                {
                    if (Distances[sort] > Distances[sort + 1])
                    {
                        int temp = Distances[sort + 1]; Distances[sort + 1] = Distances[sort]; Distances[sort] = temp;
                        String TSTR = SimilarWords[sort + 1]; SimilarWords[sort + 1] = SimilarWords[sort]; SimilarWords[sort] = TSTR;
                    }
                }
            }

            CorrectedWords += SimilarWords.Count > 0
                ? SimilarWords[0].Substring(1, SimilarWords[0].Length - 2)
                : "NULL";
            if (i < queryTerms.Count - 1) { CorrectedWords += ' '; }

            for (int f = 0; f < SimilarWords.Count; f++)
            {
                Suggested.Items.Add(SimilarWords[f].Substring(1, SimilarWords[f].Length - 2));
            }
        }
        Mean.Text = CorrectedWords;
    }

    // Soundex: retrieve documents for dictionary terms whose Soundex code matches the first query term.
    if (Soundex)
    {
        List<int> DOCS = new List<int>();
        List<int> FREQS = new List<int>();
        String Code = ComputeSoundex(queryTerms[0]);

        foreach (SoundCode SC in TE.SoundCodes)
        {
            if (Code.Equals(SC.code))
            {
                // SC.terms is a space-separated list; stem each term and collect its postings.
                String PTerms = SC.terms;
                String Temp2 = "";
                for (int i = 0; i < PTerms.Length; i++)
                {
                    if (PTerms[i] == ' ')
                    {
                        if (Temp2.Length > 0)
                        {
                            String STEMMED = new EnglishPorter2Stemmer().Stem(Temp2).Value;
                            foreach (II_Stemming II in TE.II_Stemming)
                            {
                                if (STEMMED == II.name)
                                {
                                    DOCS.Add((int)II.docid);
                                    FREQS.Add((int)II.frequency);
                                }
                            }
                        }
                        Temp2 = "";
                    }
                    else
                    {
                        Temp2 += PTerms[i];
                        if (i == PTerms.Length - 1) // flush the final term
                        {
                            String STEMMED = new EnglishPorter2Stemmer().Stem(Temp2).Value;
                            foreach (II_Stemming II in TE.II_Stemming)
                            {
                                if (STEMMED == II.name)
                                {
                                    DOCS.Add((int)II.docid);
                                    FREQS.Add((int)II.frequency);
                                }
                            }
                        }
                    }
                }
                break;
            }
        }

        // Bubble sort postings by descending frequency.
        for (int write = 0; write < FREQS.Count; write++)
        {
            for (int sort = 0; sort < FREQS.Count - 1; sort++)
            {
                if (FREQS[sort] < FREQS[sort + 1])
                {
                    int temp = FREQS[sort + 1]; FREQS[sort + 1] = FREQS[sort]; FREQS[sort] = temp;
                    int TINT = DOCS[sort + 1]; DOCS[sort + 1] = DOCS[sort]; DOCS[sort] = TINT;
                }
            }
        }

        List<String> URLS = new List<String>();
        foreach (EnglishPage EP in TE.EnglishPages.ToList()) { URLS.Add(EP.URL); }
        for (int i = 0; i < DOCS.Count; i++)
        {
            Results.Items.Add(URLS[DOCS[i] - 1]);
            Proximity.Items.Add(FREQS[i].ToString());
        }
    }
    else if (queryTerms.Count > 0)
    {
        TERMS = new Dictionary<String, Dictionary<int, List<int>>>();
        DBNames = new List<String>();
        List<String> URLS = new List<String>();
        foreach (EnglishPage EP in TE.EnglishPages.ToList()) { URLS.Add(EP.URL); }

        // Build a term -> (docid -> positions) map for every stemmed query term.
        foreach (II_Stemming II in TE.II_Stemming.ToList())
        {
            DBNames.Add(II.name);
            if (!StemmedQueryTerms.Contains(II.name)) { continue; }

            if (!TERMS.ContainsKey(II.name))
            {
                TERMS.Add(II.name, new Dictionary<int, List<int>>());
            }
            if (!TERMS[II.name].ContainsKey((int)II.docid))
            {
                // II.positions is a space-separated list of integers (the leading character is skipped).
                String POS = II.positions;
                int C = 0;
                List<int> POSS = new List<int>();
                for (int i = 1; i < POS.Length; i++)
                {
                    if (POS[i] == ' ') { POSS.Add(C); C = 0; }
                    else
                    {
                        C = (C * 10) + (POS[i] - '0');
                        if (i == POS.Length - 1) { POSS.Add(C); }
                    }
                }
                TERMS[II.name].Add((int)II.docid, POSS);
            }
        }

        if (ExactSearch)
        {
            DOCS_EXACT = new Dictionary<int, int>();
            int[] DOCS_COUNT = new int[1610];
            List<int> NON_CANDIDATE_DOCS = new List<int>();

            // Count how many of the query terms each document contains.
            foreach (KeyValuePair<String, Dictionary<int, List<int>>> EE in TERMS)
            {
                foreach (KeyValuePair<int, List<int>> EE2 in EE.Value) { DOCS_COUNT[EE2.Key]++; }
            }
            // Documents matching some, but not all, terms cannot contain the exact phrase.
            for (int i = 1; i <= 1600; i++)
            {
                if (DOCS_COUNT[i] < StemmedQueryTerms.Count && DOCS_COUNT[i] > 0) { NON_CANDIDATE_DOCS.Add(i); }
            }

            bool TR = SOLVE(0, queryTerms.Count, 0, 0);

            List<int> DOCS_ = new List<int>();
            List<int> Freqs_ = new List<int>();
            foreach (KeyValuePair<int, int> EE in DOCS_EXACT)
            {
                DOCS_.Add(EE.Key);
                Freqs_.Add(EE.Value);
            }

            // Bubble sort results by descending frequency.
            for (int write = 0; write < Freqs_.Count; write++)
            {
                for (int sort = 0; sort < Freqs_.Count - 1; sort++)
                {
                    if (Freqs_[sort] < Freqs_[sort + 1])
                    {
                        int temp = Freqs_[sort + 1]; Freqs_[sort + 1] = Freqs_[sort]; Freqs_[sort] = temp;
                        int TINT = DOCS_[sort + 1]; DOCS_[sort + 1] = DOCS_[sort]; DOCS_[sort] = TINT;
                    }
                }
            }

            for (int i = 0; i < DOCS_.Count; i++)
            {
                Results.Items.Add(URLS[DOCS_[i] - 1]);
                Proximity.Items.Add(Freqs_[i].ToString());
            }
            // Non-common results
            for (int i = 0; i < NON_CANDIDATE_DOCS.Count; i++)
            {
                NonResults.Items.Add(URLS[NON_CANDIDATE_DOCS[i] - 1]);
            }
        }
        else
        {
            DOCS_RANDOM = new List<int>();
            DOCS_DIS = new List<int>();
            NON_CANDIDATE = new List<int>();
            SOLVE_RANDOM(queryTerms.Count);

            for (int i = 0; i < DOCS_RANDOM.Count; i++)
            {
                Results.Items.Add(URLS[DOCS_RANDOM[i] - 1]);
                Proximity.Items.Add(DOCS_DIS[i].ToString());
            }
            // Non-common results
            for (int i = 0; i < NON_CANDIDATE.Count; i++)
            {
                NonResults.Items.Add(URLS[NON_CANDIDATE[i] - 1]);
            }
        }
    }
}
static void Main(string[] args)
{
    List<string> properPlaces = new List<string> { "usa", "west-germany", "france", "uk", "japan", "canada" };
    List<string> myPlaces = new List<string>();
    List<string> myBody = new List<string>();
    List<TestObj> Testowe = new List<TestObj>();

    DirectoryInfo d = new DirectoryInfo(@"E:\Pulpit\1111111"); // assuming this is your data folder
    FileInfo[] Files = d.GetFiles("*.sgm");                    // the Reuters SGML files

    foreach (FileInfo file in Files)
    {
        const Int32 BufferSize = 128;
        using (var fileStream = File.OpenRead(file.FullName))
        using (var streamReader = new StreamReader(fileStream, Encoding.UTF8, true, BufferSize))
        {
            String line;
            String wynik = "";
            // Skip the first line (the DOCTYPE declaration), which is not valid XML.
            line = streamReader.ReadLine();
            while ((line = streamReader.ReadLine()) != null)
            {
                // Replace special characters that would conflict with the XML parser.
                wynik += ReplaceHexadecimalSymbols(line);
            }

            // Wrap the concatenated records in a root element so they deserialize as one document.
            var Myreplacedxml = "<root>" + wynik + "</root>";
            XmlSerializer Serializer = new XmlSerializer(typeof(root));
            root result;
            using (TextReader reader = new StringReader(Myreplacedxml))
                result = (root)Serializer.Deserialize(reader);

            // Keep only articles with exactly one place, that place being one of the
            // proper places, and a non-empty body.
            foreach (rootREUTERS rootREUTER in result.REUTERS)
            {
                if (rootREUTER.PLACES.Length != 1) { continue; }
                if (!properPlaces.Contains(rootREUTER.PLACES[0])) { continue; }
                if (!(rootREUTER.TEXT.BODY is null))
                {
                    myPlaces.Add(rootREUTER.PLACES[0]);
                    myBody.Add(rootREUTER.TEXT.BODY);
                }
            }
        }
        Console.WriteLine("Processing " + file.FullName + "...");
    }

    // Stemming and feature selection: count the occurrences of every stem in the corpus.
    EnglishPorter2Stemmer stemmer = new EnglishPorter2Stemmer();
    List<string> allwords = new List<string>();
    Dictionary<string, int> wordsDictionary = new Dictionary<string, int>();
    for (int i = 0; i < myBody.Count; i++)
    {
        char[] separator = { '.', ',', ' ', '\t', '"', '=', '-', '<', '>', ')', '(', ';' };
        string[] worlds = myBody[i].Split(separator);
        foreach (string world in worlds)
        {
            string pom = stemmer.Stem(world).Value;
            if (!wordsDictionary.ContainsKey(pom)) { wordsDictionary.Add(pom, 1); }
            else { wordsDictionary[pom]++; }
        }
    }

    // Only stems occurring more than 10 times across the corpus become features.
    foreach (KeyValuePair<string, int> kvp in wordsDictionary)
    {
        if (kvp.Value > 10) { allwords.Add(kvp.Key); }
    }

    // Build the feature vectors and create the test objects.
    for (int i = 0; i < myBody.Count; i++)
    {
        string[] worlds = myBody[i].Split(' ');
        List<double> feature1 = new List<double>();
        for (int j = 0; j < worlds.Length; j++)
        {
            worlds[j] = stemmer.Stem(worlds[j]).Value;
        }
        foreach (string world in allwords)
        {
            double counter = 0;
            if (worlds.Contains(world))
            {
                foreach (string wrd in worlds)
                {
                    if (wrd.Equals(world)) { counter++; }
                }
                // Essentially TF-IDF: myBody.Count is the number of documents and
                // wordsDictionary holds the occurrences of each stem across the whole corpus.
                double pomocy = (double)myBody.Count / (double)wordsDictionary[world];
                counter = counter * Math.Log(pomocy);
                feature1.Add(counter);
            }
            else
            {
                feature1.Add(0);
            }
        }
        Testowe.Add(new TestObj(myPlaces[i], feature1));
    }

    // (Commented out in the original at this point: exporting the feature matrix
    // to a CSV file for further processing in Python.)

    // Run the algorithm.
    KNN(10, Testowe, 5);

    double KNN(int k, List<TestObj> objs, int odsetek)
    {
        double accuracy = 0;
        List<string> Predictions = new List<string>();
        List<TestObj> Tests = new List<TestObj>();
        List<TestObj> Verification = new List<TestObj>();

        // Split the set: out of every block of 10 objects, the first `odsetek` go to
        // Verification (the neighbour pool) and the rest to Tests.
        // TODO: make the test-set fraction configurable.
        int DeleteUsa = 0;
        int cykle = 0;
        int poprawne = 0;
        int licz = 1;
        foreach (TestObj obj in objs)
        {
            // These two ifs keep only every 10th "usa" document, to rebalance the dominant class.
            if (obj.label.Equals("usa")) { DeleteUsa++; }
            if ((DeleteUsa == 10) || (!obj.label.Equals("usa")))
            {
                if (licz <= odsetek)
                {
                    Verification.Add(obj);
                    licz++;
                }
                else if ((licz > odsetek) && (licz < 10))
                {
                    Tests.Add(obj);
                    licz++;
                }
                else
                {
                    Tests.Add(obj);
                    licz = 1;
                }
                DeleteUsa = 0;
            }
        }
        Console.WriteLine(Tests.Count + " | " + Verification.Count);

        // Classify every test object against its k nearest verification neighbours.
        foreach (TestObj oTe in Tests)
        {
            List<double> Neighbours = new List<double>();
            List<string> Nlabels = new List<string>();
            double sqdistance;
            int iterator = 0;
            foreach (TestObj oVe in Verification)
            {
                // Euclidean distance in feature space.
                double distance = 0;
                for (int i = 0; i < oVe.features.Count; i++)
                {
                    distance += (oTe.features[i] - oVe.features[i]) * (oTe.features[i] - oVe.features[i]);
                }
                sqdistance = Math.Sqrt(distance);

                // Keep the k nearest neighbours seen so far.
                if (iterator < k)
                {
                    Neighbours.Add(sqdistance);
                    Nlabels.Add(oVe.label);
                    iterator++;
                }
                else
                {
                    // Replace the current farthest neighbour if this one is closer.
                    int farthest = 0;
                    for (int n = 1; n < Neighbours.Count; n++)
                    {
                        if (Neighbours[n] > Neighbours[farthest]) { farthest = n; }
                    }
                    if (Neighbours[farthest] > sqdistance)
                    {
                        Neighbours[farthest] = sqdistance;
                        Nlabels[farthest] = oVe.label;
                    }
                }
            }

            // Majority vote over the neighbour labels, keeping all tied winners.
            List<string> Ulabels = new List<string>();
            foreach (string place in properPlaces)
            {
                if (Nlabels.Contains(place)) { Ulabels.Add(place); }
            }
            List<string> truewinner = new List<string>();
            int max = 0;
            for (int x = 0; x < Ulabels.Count; x++)
            {
                int counter = 0;
                foreach (string label in Nlabels)
                {
                    if (Ulabels[x].Equals(label)) { counter++; }
                }
                if (x == 0)
                {
                    max = counter;
                    truewinner.Add(Ulabels[x]);
                }
                else if (counter > max)
                {
                    max = counter;
                    truewinner.Clear();
                    truewinner.Add(Ulabels[x]);
                }
                else if (counter == max)
                {
                    truewinner.Add(Ulabels[x]);
                }
            }

            if (oTe.label.Equals(truewinner[0])) { poprawne++; }
            Predictions.Add(truewinner[0]);
            cykle++;
        }

        accuracy = (double)poprawne / cykle;
        Console.WriteLine("Accuracy: " + accuracy);
        return accuracy;
    }

    Console.ReadKey();

    string ReplaceHexadecimalSymbols(string txt)
    {
        // Strip control characters and '&' (0x26), which would break the XML parser.
        string r = "[\x00-\x08\x0B\x0C\x0E-\x1F\x26]";
        return Regex.Replace(txt, r, "", RegexOptions.Compiled);
    }
}
public Tweet CleanTweet(string tweet, bool stemmingFlag, Filter filter, int minWordLength)
{
    List<string> tweetWords = new List<string>();
    string userName = "";
    DateTime tweetDate;
    string[] words;

    // Initialized with a default in case the tweet does not qualify for parsing.
    // Not a great approach, but it works for the time being.
    Tweet currentTweet = null;

    // Extract the tweet fragments. Only tweets without URLs are considered;
    // the assumption is that tweets with URLs are probably advertisements.
    if (!ContainsURL(tweet))
    {
        string dateString = tweet.Substring(0, 29);
        tweetDate = _Util.ConvertToDate(dateString);

        // What remains is the user name and the tweet contents.
        tweet = tweet.Substring(30, tweet.Length - 30);

        // Split the parts of the tweet: the first item is the user name, the second the tweet text.
        words = tweet.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
        userName = words[0];

        // Split the text on all dirty characters, including spaces.
        words = Regex.Replace(words[1], acceptableCharactersRegex, " ")
                     .Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        bool dropFlag;
        foreach (string str in words)
        {
            int tempNumericCheck;
            string keyword = str.Trim().ToLower();

            // Skip stop words, duplicates within this tweet, and plain numbers.
            if (!IsStopWord(keyword) && !tweetWords.Contains(keyword) && !int.TryParse(keyword, out tempNumericCheck))
            {
                dropFlag = false;

                // Only the word-dropping filter applies here; the other filter types
                // run once all of the data has been extracted.
                if (filter == Filter.WORDS)
                {
                    if (_BagOfWords._RemoveWords.ContainsKey(keyword))
                    {
                        _BagOfWords._RemoveWords[keyword] = Convert.ToSingle(_BagOfWords._RemoveWords[keyword]) + 1f;
                        dropFlag = true;
                    }
                }

                if (keyword.Length < minWordLength)
                {
                    // If the word is too short, record it among the filtered-out words,
                    // bumping its frequency if it is already there.
                    if (_BagOfWords._FilteredOutWords.ContainsKey(keyword))
                    {
                        _BagOfWords._FilteredOutWords[keyword] = Convert.ToSingle(_BagOfWords._FilteredOutWords[keyword]) + 1f;
                    }
                    else
                    {
                        _BagOfWords._FilteredOutWords.Add(keyword, 1f);
                    }
                    dropFlag = true;
                }

                // Keep the word if no filter dropped it, stemming on request.
                if (!dropFlag)
                {
                    if (stemmingFlag)
                    {
                        keyword = stemmer.Stem(keyword).Value;
                    }
                    _BagOfWords.Add(keyword);
                    tweetWords.Add(keyword);
                }
            }
        }

        // The tweet must contain at least one word.
        if (tweetWords.Count > 0)
        {
            _wordsUserList.AddWordsInfo(tweetWords, userName);
            _usersWordList.AddUserInfo(userName, tweetWords);
            currentTweet = new Tweet(tweetDate, userName, tweetWords);
        }
    }

    return currentTweet;
}
/// <summary>
/// Get the list of postings for a wildcard query.
/// </summary>
/// <param name="index">inverted index</param>
/// <param name="processor">normal token processor</param>
/// <returns>postings for the terms matching the wildcard</returns>
public IList<Posting> GetPostings(IIndex index, ITokenProcessor processor)
{
    processor = ((NormalTokenProcessor)processor);

    // Normal processing of the token, split into literals on '*',
    // with '$' marking the start/end of the word for the k-gram lookup.
    string[] literals = this.token.Split("*");
    for (int i = 0; i < literals.Length; i++)
    {
        List<string> processedToken = processor.ProcessToken(literals[i]);
        if (processedToken.Count > 0)
        {
            if (i == 0) { literals[i] = "$" + processedToken[0]; }
            else if (i == literals.Length - 1) { literals[i] = processedToken[0] + "$"; }
            else { literals[i] = processedToken[0]; }
        }
    }
    literals = literals.Where(x => !string.IsNullOrEmpty(x) && x != "$").ToArray();

    // Gather vocabulary candidates for each literal by AND-merging its k-gram matches.
    List<List<string>> candidatesList = new List<List<string>>();
    foreach (string literal in literals)
    {
        List<string> candidates = new List<string>();
        bool didMerge = false;
        List<string> kGramTerms = this.KGramSplitter(literal);
        foreach (string kGramTerm in kGramTerms)
        {
            if (!didMerge)
            {
                candidates = candidates.Union(this.kGram.getVocabularies(kGramTerm)).ToList();
                didMerge = true;
            }
            else
            {
                candidates = candidates.Intersect(this.kGram.getVocabularies(kGramTerm)).ToList();
            }
        }

        // Post-filtering: k-gram overlap is necessary but not sufficient,
        // so verify each candidate against the literal's position in the term.
        if (candidates.Count > 0)
        {
            if (literal.ElementAt(0) == '$' && literal.ElementAt(literal.Length - 1) != '$')
            {
                // $literal* : the term must start with the literal
                candidates = candidates.Where(s => s.StartsWith(literal.Substring(1))).ToList();
            }
            else if (literal.ElementAt(0) != '$' && literal.ElementAt(literal.Length - 1) == '$')
            {
                // *literal$ : the term must end with the literal
                candidates = candidates.Where(s => s.EndsWith(literal.Substring(0, literal.Length - 1))).ToList();
            }
            else if (literal.ElementAt(0) != '$' && literal.ElementAt(literal.Length - 1) != '$')
            {
                // *literal* : the literal must occur strictly inside the term
                candidates = candidates.Where(s => s.Contains(literal) && !s.StartsWith(literal) && !s.EndsWith(literal)).ToList();
            }
            candidatesList.Add(candidates);
        }
        else
        {
            candidatesList.Add(new List<string>());
        }
    }

    // Intersect the candidates of all literals to get the terms matching the whole wildcard.
    List<string> finalCandidates = new List<string>();
    for (int i = 0; i < candidatesList.Count; i++)
    {
        if (i == 0) { finalCandidates = finalCandidates.Union(candidatesList[i]).ToList(); }
        else { finalCandidates = finalCandidates.Intersect(candidatesList[i]).ToList(); }
    }

    // Stem the final candidates (deduplicating via a set) and fetch their postings.
    HashSet<string> stemmedFinalCandidates = new HashSet<string>();
    foreach (string s in finalCandidates)
    {
        stemmedFinalCandidates.Add(stemmer.Stem(s).Value);
    }
    return index.GetPostings(stemmedFinalCandidates.ToList());
}
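KGramSplitter's implementation isn't shown here; the idea behind the k-gram lookup is to index vocabulary terms by their overlapping character grams and to split the '$'-delimited wildcard literals the same way. A minimal 3-gram sketch of that idea (hypothetical helper, not the class's actual method):

// Hypothetical illustration of k-gram generation for wildcard matching.
// For "$net" (i.e. the query net*), the 3-grams are "$ne" and "net";
// any vocabulary term indexed under both grams is a candidate for net*.
static List<string> SplitIntoKGrams(string literal, int k = 3)
{
    var grams = new List<string>();
    if (literal.Length <= k)
    {
        grams.Add(literal); // shorter literals are looked up whole
        return grams;
    }
    for (int i = 0; i + k <= literal.Length; i++)
    {
        grams.Add(literal.Substring(i, k));
    }
    return grams;
}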