Ejemplo n.º 1
0
        /// <summary>
        /// Click handler for the "open" button: looks up the document that corresponds to the
        /// currently selected entry of <c>listResults</c> and shows its title, authors and
        /// content in a message box.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnOpen_Click(object sender, EventArgs e)
        {
            // With no selection SelectedIndex is negative and indexing Items would throw.
            int selectedIndex = listResults.SelectedIndex;
            if (selectedIndex < 0)
            {
                return;
            }

            searchSetTableAdapters.DocumentsTableAdapter docAdapter = new searchSetTableAdapters.DocumentsTableAdapter();

            // The list entry text is mapped to a document ID by Utility.getDocumentID.
            searchSet.DocumentsDataTable docTable = docAdapter.GetDataByID(Utility.getDocumentID(listResults.Items[selectedIndex].ToString()));

            // The lookup may return no rows (e.g. the index was rebuilt after the search);
            // accessing docTable[0] would throw in that case.
            if (docTable == null || docTable.Rows.Count == 0)
            {
                return;
            }

            MessageBox.Show(docTable[0].Title + "\n\n" + docTable[0].Authors + "\n\n" + docTable[0].Content);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Rebuilds the index from a document collection file.
        /// </summary>
        /// <remarks>
        /// The file is read line by line. Lines of length 0 or 1 are ignored. A line starting
        /// with '.' is a section marker selected by its second character:
        /// <c>.I &lt;number&gt;</c> starts a new document, <c>.T</c> starts the title,
        /// <c>.A</c> starts the authors and <c>.W</c> starts the content; any other marker is
        /// ignored. Every other line is appended to the section that is currently open.
        ///
        /// Steps performed:
        /// 1. All rows are deleted through the three table adapters.
        /// 2. Each parsed document is added to the in-memory tables (see
        ///    <c>addParsedDocument</c>) and progress is reported after each one.
        /// 3. If <c>config.idfOption == 1</c>, every term's IDF is set to
        ///    (number of documents) / (number of documents containing the term).
        /// 4. <c>calculateWeight()</c> is called and the three tables are saved.
        /// 5. If <c>MainForm.doc_config.normalizationOption == 1</c>, the stored weights of
        ///    every document are normalized through the Document_Term table adapter.
        ///
        /// NOTE(review): the in-memory tables and <c>termOccurenceInCollection</c> are fields
        /// that are not cleared here; presumably this method is meant to run once per fresh
        /// instance - confirm against the callers.
        /// </remarks>
        /// <param name="pathName">Path of the collection file to read.</param>
        public void openDocumentFile(string pathName)
        {
            searchSetTableAdapters.DocumentsTableAdapter documentTableAdapter = new searchSetTableAdapters.DocumentsTableAdapter();
            searchSetTableAdapters.Document_TermTableAdapter documentTermTableAdapter = new searchSetTableAdapters.Document_TermTableAdapter();
            searchSetTableAdapters.TermsTableAdapter termTableAdapter = new searchSetTableAdapters.TermsTableAdapter();

            documentTableAdapter.DeleteAll();
            documentTermTableAdapter.DeleteAll();
            termTableAdapter.DeleteAll();

            char nowState = ' ';            // section currently being read: 'I', 'T', 'A', 'W' or ' ' (none yet)
            bool documentPending = false;   // true once a ".I" line has been seen and its document is not stored yet
            int number = 0;                 // ID of the document currently being read
            string authors = "";
            StringBuilder titleBuilder = new StringBuilder();
            StringBuilder contentBuilder = new StringBuilder();

            int jumlah_dokumen = 0;         // number of documents stored so far

            // using guarantees the reader is closed even when parsing throws.
            using (System.IO.StreamReader file = new System.IO.StreamReader(pathName))
            {
                string line;
                while ((line = file.ReadLine()) != null)
                {
                    if (line.Length <= 1)
                    {
                        continue;
                    }

                    if (line[0] != '.')
                    {
                        // Continuation line: belongs to whichever section is open.
                        if (nowState == 'T')
                        {
                            titleBuilder.Append(line);
                            titleBuilder.Append(" ");
                        }
                        else if (nowState == 'A')
                        {
                            // One author per line, joined with ", ".
                            if (authors.Length > 0) authors += ", ";
                            authors += line.Trim();
                        }
                        else if (nowState == 'W')
                        {
                            contentBuilder.Append(line);
                            contentBuilder.Append(" ");
                        }
                        continue;
                    }

                    switch (line[1])
                    {
                        case 'I':
                            // A new document starts: store the previous one first. The very first
                            // ".I" has no predecessor, so nothing must be stored for it.
                            if (documentPending)
                            {
                                addParsedDocument(number, titleBuilder.ToString().Trim(), contentBuilder.ToString().Trim(), authors);
                                jumlah_dokumen++;
                                progressReporter.ReportProgress(jumlah_dokumen);
                            }

                            nowState = 'I';
                            number = Int32.Parse(line.Split(' ')[1]);
                            documentPending = true;

                            authors = "";
                            contentBuilder = new StringBuilder();
                            titleBuilder = new StringBuilder();
                            break;

                        case 'T':
                            nowState = 'T';
                            titleBuilder = new StringBuilder();
                            break;

                        case 'A':
                            nowState = 'A';
                            authors = "";
                            break;

                        case 'W':
                            nowState = 'W';
                            contentBuilder = new StringBuilder();
                            break;

                        default:
                            // Unknown marker: ignored, the open section stays unchanged.
                            break;
                    }
                }
            }

            // The last document has no following ".I" line to trigger its storage.
            if (documentPending)
            {
                addParsedDocument(number, titleBuilder.ToString().Trim(), contentBuilder.ToString().Trim(), authors);
                jumlah_dokumen++;
                progressReporter.ReportProgress(jumlah_dokumen);
            }

            if (config.idfOption == 1)
            {
                foreach (KeyValuePair<string, int> entry in termOccurenceInCollection)
                {
                    // entry.Value is the number of documents that contain the term.
                    searchSet.TermsRow termRow = termsDataTable.FindByTerm(entry.Key);
                    if (termRow != null)
                    {
                        termRow.IDF = (double)jumlah_dokumen / (double)entry.Value;
                    }
                }
            }

            calculateWeight();

            // save to database
            documentTableAdapter.Update(documentDataTable);
            termTableAdapter.Update(termsDataTable);
            documentTermTableAdapter.Update(documentTermDataTable);

            if (MainForm.doc_config.normalizationOption == 1)
            {
                // Runs after the save because both calls go through the table adapter.
                foreach (searchSet.DocumentsRow doc_row in documentDataTable)
                {
                    double panjang = (double) documentTermTableAdapter.HitungPanjang(doc_row.ID);
                    documentTermTableAdapter.NormalisasiWeight(panjang, doc_row.ID);
                }
            }
        }

        /// <summary>
        /// Adds one parsed document to the in-memory tables: a Documents row, a Terms row for
        /// every term not yet present, and a Document_Term row per distinct term whose weight
        /// is <c>calculateTF(count in document, highest count in document)</c>. Also increments
        /// <c>termOccurenceInCollection</c> once for every distinct term of the document.
        /// </summary>
        /// <param name="number">Document ID.</param>
        /// <param name="title">Document title.</param>
        /// <param name="content">Document content.</param>
        /// <param name="authors">Author names joined with ", ".</param>
        private void addParsedDocument(int number, string title, string content, string authors)
        {
            searchSet.DocumentsRow doc = documentDataTable.NewDocumentsRow();
            doc.ID = number;
            doc.Title = title;
            doc.Content = content;
            doc.Authors = authors;
            documentDataTable.AddDocumentsRow(doc);

            // Terms are taken from title, content and authors together, lower-cased.
            string allContentsInDocument = title + " " + content + " " + authors;
            List<String> splittedTerms = Utility.separateData(allContentsInDocument.ToLower());
            splittedTerms = splittedTerms.Where(s => !string.IsNullOrWhiteSpace(s)).ToList();
            splittedTerms = Utility.removeStopWord(splittedTerms);
            if (config.stemmingOption == 1)
            {
                splittedTerms = Utility.stemming(splittedTerms);
            }

            int max_term = 0;   // highest count of any single term in this document
            Dictionary<string, int> termOccurenceInDocument = new Dictionary<string, int>();
            foreach (string s in splittedTerms)
            {
                int countInDocument;
                if (termOccurenceInDocument.TryGetValue(s, out countInDocument))
                {
                    countInDocument++;
                }
                else
                {
                    countInDocument = 1;

                    // First occurrence in this document: the term appears in one more document
                    // of the collection. The in-document count must not be touched here.
                    if (!termOccurenceInCollection.ContainsKey(s))
                        termOccurenceInCollection[s] = 1;
                    else
                        termOccurenceInCollection[s]++;
                }
                termOccurenceInDocument[s] = countInDocument;

                if (max_term < countInDocument) max_term = countInDocument;
            }

            foreach (KeyValuePair<string, int> entry in termOccurenceInDocument)
            {
                // A term that an earlier document already added must not be inserted twice;
                // check explicitly instead of swallowing whatever AddTermsRow throws.
                if (termsDataTable.FindByTerm(entry.Key) == null)
                {
                    searchSet.TermsRow term = termsDataTable.NewTermsRow();
                    term.Term = entry.Key;
                    term.IDF = 1;   // default; replaced later when the IDF option is enabled
                    termsDataTable.AddTermsRow(term);
                }

                searchSet.Document_TermRow dterm = documentTermDataTable.NewDocument_TermRow();
                dterm.Term = entry.Key;
                dterm.Document_ID = number;
                dterm.Weight = calculateTF(entry.Value, max_term);
                documentTermDataTable.AddDocument_TermRow(dterm);
            }
        }