/// <summary>
/// Tokenizes <paramref name="data"/> and records the position of every token in the
/// persisted inverted index.
/// </summary>
/// <remarks>
/// The index is loaded with <c>Lingkungan.LoadInvertedIndex</c>, updated in memory and
/// written back with <c>Lingkungan.SaveInvertedIndex</c>. A term whose <c>Jenis</c> is still
/// <c>JenisKata.Unknown</c> is classified from its cached page when that file exists; the
/// page is downloaded first when <c>InetCon</c> is set.
/// </remarks>
/// <param name="data">Raw text; it is split on every character that is not an ASCII letter or digit.</param>
/// <param name="domain">Passed through unchanged to each new <c>Location</c>.</param>
/// <param name="infID">Passed through unchanged to each new <c>Location</c>.</param>
/// <param name="InfDetID">Passed through unchanged to each new <c>Location</c>.</param>
public void createInvertedIndex(string data, int domain, int infID, int InfDetID)
{
    List<string> words = Regex.Split(data, @"[^A-Za-z0-9]").Where(w => w != string.Empty).ToList();
    List<string> stopwords = Lingkungan.getStopWordList();

    List<Term> InvertedIndex = Lingkungan.LoadInvertedIndex();
    if (InvertedIndex == null)
    {
        InvertedIndex = new List<Term>();
    }

    for (int i = 0; i < words.Count; i++)
    {
        string word = words[i];

        // Ordinal, case-insensitive match: tokens are ASCII-only, so the comparison must not
        // depend on the current culture's casing rules.
        Term kata = InvertedIndex.FirstOrDefault(
            x => string.Equals(x.Word, word, StringComparison.OrdinalIgnoreCase));

        if (kata == null)
        {
            kata = new Term();
            kata.Word = word.ToLowerInvariant();
            kata.Jenis = JenisKata.Unknown;
            InvertedIndex.Add(kata);
        }

        if (kata.Jenis == JenisKata.Unknown)
        {
            // The download result is deliberately ignored: a page cached by an earlier run
            // is still usable when the request fails.
            if (InetCon)
            {
                ScraptDataFromWebKBBI(kata.Word);
            }

            if (File.Exists(Lingkungan.getDataCacheKata() + kata.Word + ".html"))
            {
                kata.Jenis = GetJenisKataFromScraptFile(kata.Word);
            }
        }

        // Any() stops at the first hit instead of counting every match.
        kata.StopWord = stopwords != null
            && stopwords.Any(s => string.Equals(s, word, StringComparison.OrdinalIgnoreCase));

        // NOTE(review): assumes Term initializes Index to an empty collection - confirm.
        kata.Index.Add(new Location(domain, infID, InfDetID, i));
    }

    Lingkungan.SaveInvertedIndex(InvertedIndex);

    // TODO: calculate weighting
}
/// <summary>
/// Downloads the page for <paramref name="kata"/> from kbbi.web.id and stores it as
/// "&lt;word in lower case&gt;.html" under the directory returned by
/// <c>Lingkungan.getDataCacheKata</c>.
/// </summary>
/// <param name="kata">Word to look up; it is URL-escaped before being appended to the site address.</param>
/// <returns>
/// <c>true</c> when the page was downloaded and written; <c>false</c> when any exception
/// occurred (network, encoding or file system). The method is best-effort and never throws.
/// </returns>
public bool ScraptDataFromWebKBBI(string kata)
{
    try
    {
        // Escape the word so characters such as '?', '#' or '/' cannot change the request path.
        string url = "http://kbbi.web.id/" + Uri.EscapeDataString(kata);
        string cachePath = Lingkungan.getDataCacheKata();
        string Data;

        HttpWebRequest Request = (HttpWebRequest)WebRequest.Create(url);

        // The response and its stream hold the underlying connection, so they are released
        // deterministically even when reading fails.
        using (HttpWebResponse Response = (HttpWebResponse)Request.GetResponse())
        using (Stream receiveStream = Response.GetResponseStream())
        {
            // A missing or empty charset falls back to StreamReader's default decoding.
            string charset = Response.CharacterSet;
            using (StreamReader ReaderStream = string.IsNullOrEmpty(charset)
                ? new StreamReader(receiveStream)
                : new StreamReader(receiveStream, Encoding.GetEncoding(charset)))
            {
                Data = ReaderStream.ReadToEnd();
            }
        }

        System.IO.FileInfo file = new System.IO.FileInfo(cachePath + kata.ToLower() + ".html");
        file.Directory.Create(); // Does nothing when the directory already exists.
        System.IO.File.WriteAllText(file.FullName, Data);

        return true;
    }
    catch (Exception)
    {
        // Best-effort: callers only need to know that no fresh page was cached.
        return false;
    }
}
/// <summary>
/// Determines the word class of <paramref name="kata"/> from its cached HTML page.
/// </summary>
/// <remarks>
/// The content of every <c>textarea</c> element with id <c>jsdata</c> is split into
/// alphanumeric tokens. Starting at the first token equal to the word, the first token that
/// is a known class code (n, v, a, pron, adv, p, num) and has the token "em" two positions
/// before it decides the result.
/// </remarks>
/// <param name="kata">Word whose cached page "&lt;word in lower case&gt;.html" is parsed.</param>
/// <returns>
/// The matching <c>JenisKata</c>, or <c>JenisKata.Unknown</c> when the word or a class code is
/// not found or the content cannot be parsed.
/// </returns>
/// <remarks>Loading the file happens outside the guarded section, so load failures propagate to the caller.</remarks>
public JenisKata GetJenisKataFromScraptFile(string kata)
{
    HtmlAgilityPack.HtmlDocument htmldoc = new HtmlAgilityPack.HtmlDocument();
    string location = Lingkungan.getDataCacheKata() + kata.ToLower() + ".html";
    htmldoc.Load(location);

    try
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        var nodes = htmldoc.DocumentNode.SelectNodes("//textarea[@id='jsdata']");
        if (nodes == null)
        {
            return JenisKata.Unknown;
        }

        List<string> tokens = new List<string>();
        foreach (HtmlNode node in nodes)
        {
            // An empty textarea has no child to read; skip it instead of failing the whole parse.
            if (node.ChildNodes.Count == 0)
            {
                continue;
            }

            tokens.AddRange(
                Regex.Split(node.ChildNodes[0].InnerHtml, @"[^A-Za-z0-9]").Where(t => t != string.Empty));
        }

        // -1 means "not found", so a hit at index 0 is no longer mistaken for a miss.
        int start = tokens.FindIndex(t => string.Equals(t, kata, StringComparison.OrdinalIgnoreCase));
        if (start < 0)
        {
            return JenisKata.Unknown;
        }

        // Begin no earlier than index 2 so that tokens[j - 2] is always a valid index.
        for (int j = Math.Max(start, 2); j < tokens.Count - 2; j++)
        {
            // NOTE(review): the original tested tokens[j - 2] == "em" twice; given the
            // Count - 2 loop bound the second operand may have been meant to be tokens[j + 2].
            // The effective behavior (a single check of j - 2) is kept - confirm against real pages.
            if (!string.Equals(tokens[j - 2], "em", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            switch (tokens[j].ToLowerInvariant())
            {
                case "n":
                    return JenisKata.Benda;
                case "v":
                    return JenisKata.Kerja;
                case "a":
                    return JenisKata.Sifat;
                case "pron":
                    return JenisKata.Ganti;
                case "adv":
                    return JenisKata.Keterangan;
                case "p":
                    return JenisKata.Tugas;
                case "num":
                    return JenisKata.Bilangan;
                default:
                    // Not a class code; keep scanning.
                    break;
            }
        }

        return JenisKata.Unknown;
    }
    catch (Exception)
    {
        // Best-effort: unparseable content is reported as an unknown word class.
        return JenisKata.Unknown;
    }
}