/// <summary>
/// Builds the TF-IDF vector of <paramref name="query"/> and returns its cosine similarity
/// against the documents of <paramref name="documentMatrix"/>.
/// </summary>
/// <param name="documentMatrix">
/// Matrix model; this method reads its <c>IsTermInDocument</c> counts and <c>DocumentList</c>,
/// then hands the model to <c>CalcaulateCosineSimilarity</c>.
/// </param>
/// <param name="query">Query text; its per-term frequencies come from <c>CountFrequencyOfTermQuery</c>.</param>
/// <returns>The dictionary returned by <c>CalcaulateCosineSimilarity</c> for the query vector.</returns>
public Dictionary <string, double> CalculateSimilarity(TermDocumentMatrixModel documentMatrix, string query)
        {
            Dictionary <string, int>    termFrequencies = CountFrequencyOfTermQuery(query);
            Dictionary <string, double> tf_idfOfQuery   = new Dictionary <string, double>();

            // A single calculator instance serves every term; nothing in this loop depends on a fresh one.
            TermDocumentMatrix calculator    = new TermDocumentMatrix();
            int                documentCount = documentMatrix.DocumentList.Count;
            double             sumOfSquares  = 0.0;

            foreach (var term in termFrequencies)
            {
                double tf = calculator.CalculateTF(term.Value);

                // Terms unknown to the matrix keep a document count of 0 (TryGetValue resets the out value on a miss).
                int documentsContainingTerm;
                documentMatrix.IsTermInDocument.TryGetValue(term.Key, out documentsContainingTerm);

                double idf    = calculator.CalculateIDF(documentsContainingTerm, documentCount);
                double weight = calculator.CalculateTF_IDF(tf, idf);

                tf_idfOfQuery.Add(term.Key, weight);
                sumOfSquares = sumOfSquares + Math.Pow(weight, 2);
            }

            // Euclidean length of the query vector, rounded to three decimals before it is used downstream.
            double vectorNorm = Math.Round(Math.Sqrt(sumOfSquares), 3);

            return CalcaulateCosineSimilarity(tf_idfOfQuery, vectorNorm, documentMatrix);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Fills <paramref name="documentMatrix"/> with per-term document counts, TF, IDF and TF-IDF
        /// tables, and sets each document's <c>VectorNorm</c> to the rounded Euclidean length of its
        /// TF-IDF weights.
        /// </summary>
        /// <param name="documentMatrix">
        /// Model whose <c>IndexTermList</c> and <c>DocumentList</c> are read; its four table properties
        /// and every document's <c>VectorNorm</c> are overwritten.
        /// </param>
        /// <returns>The same <paramref name="documentMatrix"/> instance, after it has been populated.</returns>
        public TermDocumentMatrixModel SettingTermDocumentMatrix(TermDocumentMatrixModel documentMatrix)
        {
            Dictionary <string, int> documentCountByTerm = new Dictionary <string, int>();
            Dictionary <string, Dictionary <string, double> > tfByTerm = new Dictionary <string, Dictionary <string, double> >();
            Dictionary <string, double> idfByTerm = new Dictionary <string, double>();
            Dictionary <string, Dictionary <string, double> > tfIdfByTerm = new Dictionary <string, Dictionary <string, double> >();

            var documents     = documentMatrix.DocumentList;
            int documentCount = documents.Count;

            // Sums of squared weights are collected here and written to VectorNorm only at the end,
            // so a value already stored on a document (e.g. from an earlier run) cannot leak into the result.
            double[] squaredNorms = new double[documentCount];

            foreach (var term in documentMatrix.IndexTermList)
            {
                int documentsContainingTerm = 0;
                Dictionary <string, double> tfByDocument    = new Dictionary <string, double>();
                Dictionary <string, double> tfIdfByDocument = new Dictionary <string, double>();

                // First pass: TF per document, and the number of documents that contain the term.
                foreach (var document in documents)
                {
                    if (document.IndexTermDocuments.ContainsKey(term))
                    {
                        documentsContainingTerm++;
                        tfByDocument.Add(document.Name, CalculateTF(document.IndexTermDocuments[term]));
                    }
                    else
                    {
                        // Term absent from this document: TF = 0
                        tfByDocument.Add(document.Name, 0);
                    }
                }

                // IDF needs the complete document count for the term, hence the separate second pass below.
                double idf = CalculateIDF(documentsContainingTerm, documentCount);
                idfByTerm.Add(term, idf);
                documentCountByTerm.Add(term, documentsContainingTerm);
                tfByTerm.Add(term, tfByDocument);

                // Second pass: TF-IDF per document and its contribution to that document's norm.
                for (int i = 0; i < documentCount; i++)
                {
                    string documentName = documents[i].Name;
                    double weight       = CalculateTF_IDF(tfByDocument[documentName], idf);
                    tfIdfByDocument.Add(documentName, weight);
                    squaredNorms[i] = squaredNorms[i] + Math.Pow(weight, 2);
                }
                tfIdfByTerm.Add(term, tfIdfByDocument);
            }

            // VectorNorm: square root of the accumulated squares, rounded to three decimals.
            for (int i = 0; i < documentCount; i++)
            {
                documents[i].VectorNorm = Math.Round(Math.Sqrt(squaredNorms[i]), 3);
            }

            documentMatrix.IDFIndexTermOfDocument    = idfByTerm;
            documentMatrix.TFIndexTermOfDocument     = tfByTerm;
            documentMatrix.IsTermInDocument          = documentCountByTerm;
            documentMatrix.TF_IDFIndexTermOfDocument = tfIdfByTerm;
            return documentMatrix;
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Writes the demo banner to the console: a title, the number of documents in the model,
 /// a numbered list of the document names, and a separator line.
 /// </summary>
 /// <param name="documentMatrixModel">Model whose <c>DocumentList</c> supplies the count and the names.</param>
 public void DisplayHead(TermDocumentMatrixModel documentMatrixModel)
 {
     Console.WriteLine("==[ Vector Model ]==");

     string introduction = string.Format("\nInformation: We use these {0} articles from Medium.com for demonstrating the Vector Model", documentMatrixModel.DocumentList.Count);
     Console.WriteLine(introduction);

     // Entries are numbered from 1 for display; the list itself is zero-based.
     for (int number = 1; number <= documentMatrixModel.DocumentList.Count; number++)
     {
         string entry = string.Format("({0}) {1}", number, documentMatrixModel.DocumentList[number - 1].Name);
         Console.WriteLine(entry);
     }

     Console.WriteLine("The result will show the retrieved document and similarity of Vector Model");
     Console.WriteLine("\n------------------------------\n");
 }
Ejemplo n.º 4
0
 /// <summary>
 /// Writes the similarity section to the console: a heading, the query text, and a table
 /// built by <c>ConsoleTableBuilder</c> from <c>GetSampleTableData(cosineSimilarity)</c>.
 /// </summary>
 /// <param name="documentMatrixModel">Not read by this method.</param>
 /// <param name="cosineSimilarity">Similarity values passed to <c>GetSampleTableData</c>.</param>
 /// <param name="query">Query text echoed before the table.</param>
 public void DisplayBodyResult(TermDocumentMatrixModel documentMatrixModel, Dictionary <string, double> cosineSimilarity, string query)
 {
     Console.WriteLine("\n\n=[ Similarity ]=\n");

     string queryLine = string.Format("Query is :{0}", query);
     Console.WriteLine(queryLine);

     var tableData = GetSampleTableData(cosineSimilarity);
     var table     = ConsoleTableBuilder.From(tableData);
     table.ExportAndWriteLine();
 }
        /// <summary>
        /// Computes, for every document in <paramref name="documentMatrix"/>, the cosine similarity
        /// between the document's TF-IDF weights and the query's TF-IDF weights.
        /// </summary>
        /// <param name="tf_idfQuery">TF-IDF weight of each query term, keyed by term.</param>
        /// <param name="vectorNorm">Norm of the query vector, multiplied with each document's <c>VectorNorm</c>.</param>
        /// <param name="documentMatrix">
        /// Model providing <c>DocumentList</c> (names and norms) and <c>TF_IDFIndexTermOfDocument</c>
        /// (term to per-document weights).
        /// </param>
        /// <returns>
        /// Similarity per document name, rounded to three decimals. A document whose rounded norm
        /// product is zero gets 0 rather than the NaN/Infinity a division by zero would yield.
        /// </returns>
        public Dictionary <string, double> CalcaulateCosineSimilarity(Dictionary <string, double> tf_idfQuery, double vectorNorm, TermDocumentMatrixModel documentMatrix)
        {
            Dictionary <string, double> cosineSimilarity = new Dictionary <string, double>();

            foreach (var document in documentMatrix.DocumentList)
            {
                // Dot product of the query vector and this document's vector.
                double dotProduct = 0.0;
                foreach (var queryTerm in tf_idfQuery)
                {
                    // A query term missing from the index has no weight in any document,
                    // so it adds nothing to the dot product.
                    Dictionary <string, double> weightByDocument;
                    if (documentMatrix.TF_IDFIndexTermOfDocument.TryGetValue(queryTerm.Key, out weightByDocument))
                    {
                        dotProduct = dotProduct + (weightByDocument[document.Name] * queryTerm.Value);
                    }
                }

                double normProduct = Math.Round(document.VectorNorm * vectorNorm, 3);

                // A zero norm product means one of the vectors is (numerically) empty: no similarity.
                double similarity = 0.0;
                if (normProduct != 0.0)
                {
                    similarity = Math.Round(dotProduct / normProduct, 3);
                }
                cosineSimilarity.Add(document.Name, similarity);
            }
            return cosineSimilarity;
        }