예제 #1
0
        // Builds the two term counters and hands them back through the out
        // parameters.
        //
        // searchTerm: optional; when non-empty, each of its space-separated
        //     words is added to the word counter's skip list.
        // calculateGraphMetricsContext: supplies the user's WordsToSkip
        //     setting.  Must not be null.
        // wordCounter: receives a WordCounter that skips the user's words, the
        //     search-term words and "rt".
        // wordPairCounter: receives a WordPairCounter that skips only the
        //     user's words.
        CreateCountersForWordsAndWordPairs
        (
            String searchTerm,
            CalculateGraphMetricsContext calculateGraphMetricsContext,
            out WordCounter wordCounter,
            out WordPairCounter wordPairCounter
        )
        {
            Debug.Assert(calculateGraphMetricsContext != null);

            // The user-supplied skip list, split into individual words.

            String [] userWordsToSkip = StringUtil.SplitOnSpaces(
                calculateGraphMetricsContext.GraphMetricUserSettings
                .WordMetricUserSettings.WordsToSkip);

            // Word pairs are filtered by the user-supplied list only.

            wordPairCounter = new WordPairCounter(userWordsToSkip);

            // Single words are filtered by a longer list: the user's words,
            // then the words of the search term (if there is one), then the
            // literal token "rt" (presumably the Twitter retweet marker --
            // confirm).

            List <String> wordCounterSkipList = new List <String>();

            wordCounterSkipList.AddRange(userWordsToSkip);

            if (!String.IsNullOrEmpty(searchTerm))
            {
                foreach (String searchTermWord in
                    StringUtil.SplitOnSpaces(searchTerm))
                {
                    wordCounterSkipList.Add(searchTermWord);
                }
            }

            wordCounterSkipList.Add("rt");

            wordCounter = new WordCounter(wordCounterSkipList.ToArray());
        }
예제 #2
0
        // Computes the four "top terms" strings for a set of edges and appends
        // one GraphMetricValueWithID per string, keyed by vertexRowID, to the
        // corresponding output list.
        //
        // edges, statusEdgeColumnName, maximumTopTerms, wordCounter and
        // wordPairCounter are passed straight through to
        // ConcatenateTopWordsAndWordPairs().  Each of the four lists gains
        // exactly one element.  Every string is run through
        // ExcelTextUtil.ForceCellText() before it is stored.
        AddGraphMetricValuesForTopWordsAndWordPairs
        (
            IEnumerable <IEdge> edges,
            String statusEdgeColumnName,
            Int32 maximumTopTerms,
            WordCounter wordCounter,
            WordPairCounter wordPairCounter,
            Int32 vertexRowID,

            List <GraphMetricValueWithID>
            topWordsInTweetByCountGraphMetricValues,

            List <GraphMetricValueWithID>
            topWordsInTweetBySalienceGraphMetricValues,

            List <GraphMetricValueWithID>
            topWordPairsInTweetByCountGraphMetricValues,

            List <GraphMetricValueWithID>
            topWordPairsInTweetBySalienceGraphMetricValues
        )
        {
            Debug.Assert(edges != null);
            Debug.Assert(!String.IsNullOrEmpty(statusEdgeColumnName));
            Debug.Assert(maximumTopTerms > 0);
            Debug.Assert(wordCounter != null);
            Debug.Assert(wordPairCounter != null);
            Debug.Assert(topWordsInTweetByCountGraphMetricValues != null);
            Debug.Assert(topWordsInTweetBySalienceGraphMetricValues != null);
            Debug.Assert(topWordPairsInTweetByCountGraphMetricValues != null);
            Debug.Assert(topWordPairsInTweetBySalienceGraphMetricValues != null);

            String wordsByCount;
            String wordsBySalience;
            String wordPairsByCount;
            String wordPairsBySalience;

            ConcatenateTopWordsAndWordPairs(
                edges, statusEdgeColumnName, maximumTopTerms,
                wordCounter, wordPairCounter,
                out wordsByCount, out wordsBySalience,
                out wordPairsByCount, out wordPairsBySalience);

            // Top words, ranked by count.

            String wordsByCountCell =
                ExcelTextUtil.ForceCellText(wordsByCount);

            topWordsInTweetByCountGraphMetricValues.Add(
                new GraphMetricValueWithID(vertexRowID, wordsByCountCell));

            // Top words, ranked by salience.

            String wordsBySalienceCell =
                ExcelTextUtil.ForceCellText(wordsBySalience);

            topWordsInTweetBySalienceGraphMetricValues.Add(
                new GraphMetricValueWithID(vertexRowID, wordsBySalienceCell));

            // Top word pairs, ranked by count.

            String wordPairsByCountCell =
                ExcelTextUtil.ForceCellText(wordPairsByCount);

            topWordPairsInTweetByCountGraphMetricValues.Add(
                new GraphMetricValueWithID(vertexRowID, wordPairsByCountCell));

            // Top word pairs, ranked by salience.

            String wordPairsBySalienceCell =
                ExcelTextUtil.ForceCellText(wordPairsBySalience);

            topWordPairsInTweetBySalienceGraphMetricValues.Add(
                new GraphMetricValueWithID(vertexRowID,
                    wordPairsBySalienceCell));
        }
    // Feeds five short documents to a WordPairCounter that skips no words,
    // then checks the document total, the word total, the number of distinct
    // word pairs, and the mutual information of each of the twelve pairs
    // (Jana Diesner's mutual-information example).
    TestCountTermsInDocument11()
    {
        WordPairCounter oCounter = new WordPairCounter( new String[0] );

        String [] asDocuments =
        {
            "Tim and Ben play soccer.",
            "Tim and Sue play soccer.",
            "Yes we can.",
            "epic fail",
            "Tim is available now.",
        };

        foreach (String sDocument in asDocuments)
        {
            oCounter.CountTermsInDocument(sDocument);
        }

        oCounter.CalculateMutualInformationOfCountedTerms();

        Assert.AreEqual(5, oCounter.TotalDocuments);
        Assert.AreEqual(19, oCounter.TotalWordsInDocuments);

        IEnumerable<CountedWordPair> oPairs = oCounter.CountedTerms;

        Assert.AreEqual( 12, oPairs.Count() );

        // One row per expected pair, in the order they are verified.

        var aoExpectedPairs = new []
        {
            new { Word1 = "tim", Word2 = "and", Count = 2, Documents = 2,
                MutualInformation = 0.669 },

            new { Word1 = "and", Word2 = "ben", Count = 1, Documents = 1,
                MutualInformation = 0.845 },

            new { Word1 = "ben", Word2 = "play", Count = 1, Documents = 1,
                MutualInformation = 0.845 },

            new { Word1 = "and", Word2 = "sue", Count = 1, Documents = 1,
                MutualInformation = 0.845 },

            new { Word1 = "sue", Word2 = "play", Count = 1, Documents = 1,
                MutualInformation = 0.845 },

            new { Word1 = "play", Word2 = "soccer", Count = 2, Documents = 2,
                MutualInformation = 0.845 },

            new { Word1 = "yes", Word2 = "we", Count = 1, Documents = 1,
                MutualInformation = 1.146 },

            new { Word1 = "we", Word2 = "can", Count = 1, Documents = 1,
                MutualInformation = 1.146 },

            new { Word1 = "epic", Word2 = "fail", Count = 1, Documents = 1,
                MutualInformation = 1.146 },

            new { Word1 = "tim", Word2 = "is", Count = 1, Documents = 1,
                MutualInformation = 0.669 },

            new { Word1 = "is", Word2 = "available", Count = 1, Documents = 1,
                MutualInformation = 1.146 },

            new { Word1 = "available", Word2 = "now", Count = 1, Documents = 1,
                MutualInformation = 1.146 },
        };

        foreach (var oExpected in aoExpectedPairs)
        {
            // Single() throws unless exactly one counted pair matches the
            // words, the count and the document count.

            CountedWordPair oMatch = oPairs.Single(
                oPair =>
                    oPair.Word1 == oExpected.Word1
                    && oPair.Word2 == oExpected.Word2
                    && oPair.Count == oExpected.Count
                    && oPair.DocumentsInWhichTermWasCounted ==
                        oExpected.Documents
                );

            Assert.AreEqual(oExpected.MutualInformation,
                oMatch.MutualInformation, 0.001);
        }
    }
 // Replaces m_oWordPairCounter with a new WordPairCounter built from
 // WordsToSkip.  NOTE(review): the name suggests a per-test fixture setup
 // hook; the attribute that would confirm this is not visible here.
 SetUp()
 {
     m_oWordPairCounter = new WordPairCounter(WordsToSkip);
 }
    // Counts the words and word pairs found in one edge's or vertex's text
    // cell.
    //
    // oEdgeOrVertex: the edge or vertex whose metadata holds the text.
    // sTextColumnName: metadata key under which the text is stored.
    // oWordCounter, oWordPairCounter: counters that receive the text as one
    //     document each.
    //
    // Nothing is counted when the key is missing or the text is null or
    // empty.
    CountTermsInEdgeOrVertex
    (
        IMetadataProvider oEdgeOrVertex,
        String sTextColumnName,
        WordCounter oWordCounter,
        WordPairCounter oWordPairCounter
    )
    {
        Debug.Assert(oEdgeOrVertex != null);
        Debug.Assert( !String.IsNullOrEmpty(sTextColumnName) );
        Debug.Assert(oWordCounter != null);
        Debug.Assert(oWordPairCounter != null);
        AssertValid();

        Object oCellValue;

        Boolean bHasCellValue = oEdgeOrVertex.TryGetValue(
            sTextColumnName, typeof(String), out oCellValue);

        if (!bHasCellValue)
        {
            return;
        }

        String sCellText = (String)oCellValue;

        if ( String.IsNullOrEmpty(sCellText) )
        {
            return;
        }

        // The same text is treated as a single document by both counters.

        oWordCounter.CountTermsInDocument(sCellText);
        oWordPairCounter.CountTermsInDocument(sCellText);
    }
    // Counts words and word pairs across the whole graph, ignoring groups,
    // and packages the results as graph metric columns.
    //
    // oGraph: graph whose edges or vertices carry the text.
    // oWordMetricUserSettings: says whether the text column is on the edge
    //     worksheet and what the column is called.
    // oWordCounter, oWordPairCounter: counters to fill.
    // oUniqueImportedIDs: passed through to EnumerateEdgesOrVertices(); may
    //     be null.
    // oGraphMetricColumns: receives the columns built from the counted terms.
    //     The two group-name arguments of CreateGraphMetricColumns() are
    //     passed as null here.
    //
    // Always returns true.
    TryCountTermsNoGroups
    (
        IGraph oGraph,
        WordMetricUserSettings oWordMetricUserSettings,
        WordCounter oWordCounter,
        WordPairCounter oWordPairCounter,
        HashSet<String> oUniqueImportedIDs,
        out GraphMetricColumn [] oGraphMetricColumns
    )
    {
        Debug.Assert(oGraph != null);
        Debug.Assert(oWordMetricUserSettings != null);
        Debug.Assert(oWordCounter != null);
        Debug.Assert(oWordPairCounter != null);
        AssertValid();

        // Pick the collection that holds the text column.

        Boolean bTextIsOnEdges =
            oWordMetricUserSettings.TextColumnIsOnEdgeWorksheet;

        System.Collections.IEnumerable oTextItems;

        if (bTextIsOnEdges)
        {
            oTextItems = (System.Collections.IEnumerable)oGraph.Edges;
        }
        else
        {
            oTextItems = (System.Collections.IEnumerable)oGraph.Vertices;
        }

        // Feed every cell of the text column to both counters.

        foreach ( IMetadataProvider oTextItem in EnumerateEdgesOrVertices(
            oTextItems, bTextIsOnEdges, oGraph, oUniqueImportedIDs) )
        {
            CountTermsInEdgeOrVertex(oTextItem,
                oWordMetricUserSettings.TextColumnName, oWordCounter,
                oWordPairCounter);
        }

        // Derived statistics are computed once all documents are in.

        oWordCounter.CalculateSalienceOfCountedTerms();
        oWordPairCounter.CalculateSalienceOfCountedTerms();
        oWordPairCounter.CalculateMutualInformationOfCountedTerms();

        // Copy the counted terms into per-column value lists.

        List<GraphMetricValueOrdered> oWords;
        List<GraphMetricValueOrdered> oWordCounts;
        List<GraphMetricValueOrdered> oWordSaliences;

        List<GraphMetricValueOrdered> oPairWord1s;
        List<GraphMetricValueOrdered> oPairWord2s;
        List<GraphMetricValueOrdered> oPairCounts;
        List<GraphMetricValueOrdered> oPairSaliences;
        List<GraphMetricValueOrdered> oPairMutualInformations;

        CreateGraphMetricValueLists(
            out oWords, out oWordCounts, out oWordSaliences,
            out oPairWord1s, out oPairWord2s, out oPairCounts,
            out oPairSaliences, out oPairMutualInformations);

        foreach (CountedWord oWord in oWordCounter.CountedTerms)
        {
            AddCountedWordToValueLists(oWord, oWordMetricUserSettings,
                oWords, oWordCounts, oWordSaliences);
        }

        foreach (CountedWordPair oPair in oWordPairCounter.CountedTerms)
        {
            AddCountedWordPairToValueLists(oPair, oWordMetricUserSettings,
                oPairWord1s, oPairWord2s, oPairCounts, oPairSaliences,
                oPairMutualInformations);
        }

        // No group-name columns in the ungrouped case.

        oGraphMetricColumns = CreateGraphMetricColumns(
            oWords, oWordCounts, oWordSaliences, null,
            oPairWord1s, oPairWord2s, oPairCounts, oPairSaliences,
            oPairMutualInformations, null);

        return true;
    }
    // Counts words and word pairs in vertex text, one group at a time, and
    // packages the per-group results as graph metric columns.
    //
    // oGraph: graph whose vertices carry the text.
    // oWordMetricUserSettings: supplies the text column name.
    // oWordCounter, oWordPairCounter: counters that are cleared and refilled
    //     for every group.
    // oUniqueImportedIDs: passed through to EnumerateEdgesOrVertices(); may
    //     be null.
    // oGraphMetricColumns: receives the columns, including a group-name
    //     column for words and another for word pairs.
    //
    // Always returns true.
    TryCountVertexTermsByGroup
    (
        IGraph oGraph,
        WordMetricUserSettings oWordMetricUserSettings,
        WordCounter oWordCounter,
        WordPairCounter oWordPairCounter,
        HashSet<String> oUniqueImportedIDs,
        out GraphMetricColumn [] oGraphMetricColumns
    )
    {
        Debug.Assert(oGraph != null);
        Debug.Assert(oWordMetricUserSettings != null);
        Debug.Assert(oWordCounter != null);
        Debug.Assert(oWordPairCounter != null);
        AssertValid();

        List<GraphMetricValueOrdered> oWords;
        List<GraphMetricValueOrdered> oWordCounts;
        List<GraphMetricValueOrdered> oWordSaliences;

        List<GraphMetricValueOrdered> oPairWord1s;
        List<GraphMetricValueOrdered> oPairWord2s;
        List<GraphMetricValueOrdered> oPairCounts;
        List<GraphMetricValueOrdered> oPairSaliences;
        List<GraphMetricValueOrdered> oPairMutualInformations;

        CreateGraphMetricValueLists(
            out oWords, out oWordCounts, out oWordSaliences,
            out oPairWord1s, out oPairWord2s, out oPairCounts,
            out oPairSaliences, out oPairMutualInformations);

        List<GraphMetricValueOrdered> oWordGroupNames =
            new List<GraphMetricValueOrdered>();

        List<GraphMetricValueOrdered> oPairGroupNames =
            new List<GraphMetricValueOrdered>();

        // Per the original comment, the enumerated groups include a dummy
        // group for the entire graph and another for non-grouped vertices.
        //
        // NOTE(review): oUniqueImportedIDs is never cleared between groups
        // here, whereas the edge-based counterpart clears it after the
        // entire-graph group.  Whether that matters depends on
        // EnumerateEdgesOrVertices(), which is not visible -- confirm.

        foreach ( GroupInfo oCurrentGroup in
            EnumerateGroupsForCountingVertexTerms(oGraph) )
        {
            // Start from empty counters so that totals are per group.

            oWordCounter.Clear();
            oWordPairCounter.Clear();

            foreach ( IVertex oGroupVertex in EnumerateEdgesOrVertices(
                oCurrentGroup.Vertices, false, oGraph, oUniqueImportedIDs) )
            {
                CountTermsInEdgeOrVertex(oGroupVertex,
                    oWordMetricUserSettings.TextColumnName, oWordCounter,
                    oWordPairCounter);
            }

            oWordCounter.CalculateSalienceOfCountedTerms();
            oWordPairCounter.CalculateSalienceOfCountedTerms();
            oWordPairCounter.CalculateMutualInformationOfCountedTerms();

            // Append this group's terms, tagged with the group name.

            AddCountedWordsToValueLists(oWordCounter.CountedTerms,
                oWordMetricUserSettings, oCurrentGroup.Name, oWords,
                oWordCounts, oWordSaliences, oWordGroupNames);

            AddCountedWordPairsToValueLists(oWordPairCounter.CountedTerms,
                oWordMetricUserSettings, oCurrentGroup.Name, oPairWord1s,
                oPairWord2s, oPairCounts, oPairSaliences,
                oPairMutualInformations, oPairGroupNames);
        }

        oGraphMetricColumns = CreateGraphMetricColumns(
            oWords, oWordCounts, oWordSaliences, oWordGroupNames,
            oPairWord1s, oPairWord2s, oPairCounts, oPairSaliences,
            oPairMutualInformations, oPairGroupNames);

        return true;
    }
    // Counts words and word pairs in edge text, one group of edges at a
    // time, and packages the per-group results as graph metric columns.
    //
    // oGraph: graph whose edges carry the text.
    // oWordMetricUserSettings: supplies the text column name.
    // oWordCounter, oWordPairCounter: counters that are cleared and refilled
    //     for every group.
    // oUniqueImportedIDs: passed through to EnumerateEdgesOrVertices(); may
    //     be null.  When non-null it is cleared after the entire-graph dummy
    //     group has been processed.
    // oGraphMetricColumns: receives the columns, including a group-name
    //     column for words and another for word pairs.
    //
    // Always returns true.
    TryCountEdgeTermsByGroup
    (
        IGraph oGraph,
        WordMetricUserSettings oWordMetricUserSettings,
        WordCounter oWordCounter,
        WordPairCounter oWordPairCounter,
        HashSet<String> oUniqueImportedIDs,
        out GraphMetricColumn [] oGraphMetricColumns
    )
    {
        Debug.Assert(oGraph != null);
        Debug.Assert(oWordMetricUserSettings != null);
        Debug.Assert(oWordCounter != null);
        Debug.Assert(oWordPairCounter != null);
        AssertValid();

        List<GraphMetricValueOrdered> oWords;
        List<GraphMetricValueOrdered> oWordCounts;
        List<GraphMetricValueOrdered> oWordSaliences;

        List<GraphMetricValueOrdered> oPairWord1s;
        List<GraphMetricValueOrdered> oPairWord2s;
        List<GraphMetricValueOrdered> oPairCounts;
        List<GraphMetricValueOrdered> oPairSaliences;
        List<GraphMetricValueOrdered> oPairMutualInformations;

        CreateGraphMetricValueLists(
            out oWords, out oWordCounts, out oWordSaliences,
            out oPairWord1s, out oPairWord2s, out oPairCounts,
            out oPairSaliences, out oPairMutualInformations);

        List<GraphMetricValueOrdered> oWordGroupNames =
            new List<GraphMetricValueOrdered>();

        List<GraphMetricValueOrdered> oPairGroupNames =
            new List<GraphMetricValueOrdered>();

        // Walk the edges group by group.  Per the original comment, the
        // sorter also yields a "dummy" group holding the edges that belong
        // to no real group.

        foreach ( GroupEdgeInfo oEdgeGroup in
            GroupEdgeSorter.SortGroupEdges(oGraph, Int32.MaxValue,
                true, true) )
        {
            // Start from empty counters so that totals are per group.

            oWordCounter.Clear();
            oWordPairCounter.Clear();

            foreach ( IEdge oGroupEdge in EnumerateEdgesOrVertices(
                oEdgeGroup.Edges, true, oGraph, oUniqueImportedIDs) )
            {
                CountTermsInEdgeOrVertex(oGroupEdge,
                    oWordMetricUserSettings.TextColumnName, oWordCounter,
                    oWordPairCounter);
            }

            oWordCounter.CalculateSalienceOfCountedTerms();
            oWordPairCounter.CalculateSalienceOfCountedTerms();
            oWordPairCounter.CalculateMutualInformationOfCountedTerms();

            // Append this group's terms, tagged with the group name.

            String sCurrentGroupName = oEdgeGroup.GroupName;

            AddCountedWordsToValueLists(oWordCounter.CountedTerms,
                oWordMetricUserSettings, sCurrentGroupName, oWords,
                oWordCounts, oWordSaliences, oWordGroupNames);

            AddCountedWordPairsToValueLists(oWordPairCounter.CountedTerms,
                oWordMetricUserSettings, sCurrentGroupName, oPairWord1s,
                oPairWord2s, oPairCounts, oPairSaliences,
                oPairMutualInformations, oPairGroupNames);

            Boolean bIsEntireGraphGroup =
                (sCurrentGroupName ==
                    GroupEdgeSorter.DummyGroupNameForEntireGraph);

            if (bIsEntireGraphGroup && oUniqueImportedIDs != null)
            {
                // The entire-graph dummy group has just been counted.  Per
                // the original comment, SortGroupEdges() guarantees that it
                // comes first, so the imported IDs collected while counting
                // it are discarded exactly once, before the other groups are
                // counted.

                oUniqueImportedIDs.Clear();
            }
        }

        oGraphMetricColumns = CreateGraphMetricColumns(
            oWords, oWordCounts, oWordSaliences, oWordGroupNames,
            oPairWord1s, oPairWord2s, oPairCounts, oPairSaliences,
            oPairMutualInformations, oPairGroupNames);

        return true;
    }
    // Entry point for the word metrics: decides whether anything needs to be
    // calculated, builds the counters, and dispatches to the grouped or
    // ungrouped counting routine.
    //
    // graph: graph to analyze.
    // calculateGraphMetricsContext: supplies the user settings and the
    //     "should this metric be calculated" decision.
    // graphMetricColumns: receives the calculated columns, or an empty array
    //     when word metrics are not requested or no text column is named.
    //
    // Returns true when nothing needs to be calculated; otherwise returns
    // whatever the selected counting routine returns.
    TryCalculateGraphMetrics
    (
        IGraph graph,
        CalculateGraphMetricsContext calculateGraphMetricsContext,
        out GraphMetricColumn [] graphMetricColumns
    )
    {
        Debug.Assert(graph != null);
        Debug.Assert(calculateGraphMetricsContext != null);
        AssertValid();

        // Default result for the early exits below.

        graphMetricColumns = new GraphMetricColumn[0];

        WordMetricUserSettings oSettings =
            calculateGraphMetricsContext.GraphMetricUserSettings
            .WordMetricUserSettings;

        if ( !calculateGraphMetricsContext.ShouldCalculateGraphMetrics(
            GraphMetrics.Words) )
        {
            return true;
        }

        if ( String.IsNullOrEmpty(oSettings.TextColumnName) )
        {
            return true;
        }

        // Both counters skip the same user-supplied words.

        String [] asSkippedWords = StringUtil.SplitOnCommonDelimiters(
            oSettings.WordsToSkip);

        WordCounter oWords = new WordCounter(asSkippedWords);
        WordPairCounter oWordPairs = new WordPairCounter(asSkippedWords);

        // A set is created only when the edges or vertices have imported
        // IDs; otherwise the counting routines are handed null.

        HashSet<String> oImportedIDs = null;

        if ( EdgesOrVerticesHaveImportedIDs(graph,
            oSettings.TextColumnIsOnEdgeWorksheet) )
        {
            oImportedIDs = new HashSet<String>();
        }

        if (!oSettings.CountByGroup)
        {
            return TryCountTermsNoGroups(graph, oSettings, oWords,
                oWordPairs, oImportedIDs, out graphMetricColumns);
        }

        if (oSettings.TextColumnIsOnEdgeWorksheet)
        {
            return TryCountEdgeTermsByGroup(graph, oSettings, oWords,
                oWordPairs, oImportedIDs, out graphMetricColumns);
        }

        return TryCountVertexTermsByGroup(graph, oSettings, oWords,
            oWordPairs, oImportedIDs, out graphMetricColumns);
    }
예제 #10
0
        // Feeds five short documents to a WordPairCounter that skips no
        // words, then checks the document total, the word total, the number
        // of distinct word pairs, and the mutual information of each of the
        // twelve pairs (Jana Diesner's mutual-information example).
        TestCountTermsInDocument11()
        {
            WordPairCounter oCounter = new WordPairCounter(new String[0]);

            String [] asDocuments =
            {
                "Tim and Ben play soccer.",
                "Tim and Sue play soccer.",
                "Yes we can.",
                "epic fail",
                "Tim is available now.",
            };

            foreach (String sDocument in asDocuments)
            {
                oCounter.CountTermsInDocument(sDocument);
            }

            oCounter.CalculateMutualInformationOfCountedTerms();

            Assert.AreEqual(5, oCounter.TotalDocuments);
            Assert.AreEqual(19, oCounter.TotalWordsInDocuments);

            IEnumerable <CountedWordPair> oPairs = oCounter.CountedTerms;

            Assert.AreEqual(12, oPairs.Count());

            // One row per expected pair, in the order they are verified.

            var aoExpectedPairs = new []
            {
                new { Word1 = "tim", Word2 = "and", Count = 2,
                    Documents = 2, MutualInformation = 0.669 },

                new { Word1 = "and", Word2 = "ben", Count = 1,
                    Documents = 1, MutualInformation = 0.845 },

                new { Word1 = "ben", Word2 = "play", Count = 1,
                    Documents = 1, MutualInformation = 0.845 },

                new { Word1 = "and", Word2 = "sue", Count = 1,
                    Documents = 1, MutualInformation = 0.845 },

                new { Word1 = "sue", Word2 = "play", Count = 1,
                    Documents = 1, MutualInformation = 0.845 },

                new { Word1 = "play", Word2 = "soccer", Count = 2,
                    Documents = 2, MutualInformation = 0.845 },

                new { Word1 = "yes", Word2 = "we", Count = 1,
                    Documents = 1, MutualInformation = 1.146 },

                new { Word1 = "we", Word2 = "can", Count = 1,
                    Documents = 1, MutualInformation = 1.146 },

                new { Word1 = "epic", Word2 = "fail", Count = 1,
                    Documents = 1, MutualInformation = 1.146 },

                new { Word1 = "tim", Word2 = "is", Count = 1,
                    Documents = 1, MutualInformation = 0.669 },

                new { Word1 = "is", Word2 = "available", Count = 1,
                    Documents = 1, MutualInformation = 1.146 },

                new { Word1 = "available", Word2 = "now", Count = 1,
                    Documents = 1, MutualInformation = 1.146 },
            };

            foreach (var oExpected in aoExpectedPairs)
            {
                // Single() throws unless exactly one counted pair matches
                // the words, the count and the document count.

                CountedWordPair oMatch = oPairs.Single(
                    oPair =>
                        oPair.Word1 == oExpected.Word1
                        && oPair.Word2 == oExpected.Word2
                        && oPair.Count == oExpected.Count
                        && oPair.DocumentsInWhichTermWasCounted ==
                            oExpected.Documents
                    );

                Assert.AreEqual(oExpected.MutualInformation,
                    oMatch.MutualInformation, 0.001);
            }
        }
예제 #11
0
 // Replaces m_oWordPairCounter with a new WordPairCounter built from
 // WordsToSkip.  NOTE(review): the name suggests a per-test fixture setup
 // hook; the attribute that would confirm this is not visible here.
 SetUp()
 {
     m_oWordPairCounter = new WordPairCounter(WordsToSkip);
 }
    // Counts the words and word pairs found in the status text of a set of
    // edges, then builds four delimited strings from the top-ranked terms.
    //
    // oEdges:                 edges whose status text is counted.
    // sStatusEdgeColumnName:  key used to read the status text from an edge.
    // iMaximumTopStrings:     limit passed to TakeTopStringsAsArray
    //                         (presumably the maximum number of terms kept
    //                         per output string -- confirm).
    // oWordCounter,
    // oWordPairCounter:       counters to use.  Both are cleared first, so
    //                         any earlier counts they hold are discarded.
    //
    // The four out parameters receive, in order: words ranked by count, words
    // ranked by salience, word pairs ranked by count, and word pairs ranked by
    // salience.  Each ranking is descending.
    ConcatenateTopWordsAndWordPairs
    (
        IEnumerable<IEdge> oEdges,
        String sStatusEdgeColumnName,
        Int32 iMaximumTopStrings,
        WordCounter oWordCounter,
        WordPairCounter oWordPairCounter,
        out String sTopWordsInTweetByCount,
        out String sTopWordsInTweetBySalience,
        out String sTopWordPairsInTweetByCount,
        out String sTopWordPairsInTweetBySalience
    )
    {
        Debug.Assert(oEdges != null);
        Debug.Assert( !String.IsNullOrEmpty(sStatusEdgeColumnName) );
        Debug.Assert(iMaximumTopStrings > 0);
        Debug.Assert(oWordCounter != null);
        Debug.Assert(oWordPairCounter != null);

        // The counters are reused by the caller, so start from a clean slate.

        oWordCounter.Clear();
        oWordPairCounter.Clear();

        // Feed every non-empty status to both counters, one document per
        // edge.

        foreach (IEdge oStatusEdge in oEdges)
        {
            String sStatusText;

            if ( !oStatusEdge.TryGetNonEmptyStringValue(
                sStatusEdgeColumnName, out sStatusText) )
            {
                continue;
            }

            oWordCounter.CountTermsInDocument(sStatusText);
            oWordPairCounter.CountTermsInDocument(sStatusText);
        }

        // Salience is calculated only after all documents have been counted.

        oWordCounter.CalculateSalienceOfCountedTerms();
        oWordPairCounter.CalculateSalienceOfCountedTerms();

        // Single words, ranked by raw count.

        sTopWordsInTweetByCount = String.Join(
            TwitterSearchNetworkWordMetricUtil.WordSeparator,
            TwitterSearchNetworkStringUtil.TakeTopStringsAsArray(
                oWordCounter.CountedTerms
                    .Cast<CountedWord>()
                    .OrderByDescending(oWord => oWord.Count)
                    .Select(oWord => oWord.Word),
                iMaximumTopStrings) );

        // Single words, ranked by salience.

        sTopWordsInTweetBySalience = String.Join(
            TwitterSearchNetworkWordMetricUtil.WordSeparator,
            TwitterSearchNetworkStringUtil.TakeTopStringsAsArray(
                oWordCounter.CountedTerms
                    .Cast<CountedWord>()
                    .OrderByDescending(oWord => oWord.Salience)
                    .Select(oWord => oWord.Word),
                iMaximumTopStrings) );

        // Word pairs, ranked by raw count.

        sTopWordPairsInTweetByCount = String.Join(
            TwitterSearchNetworkWordMetricUtil.WordPairSeparator,
            TwitterSearchNetworkStringUtil.TakeTopStringsAsArray(
                oWordPairCounter.CountedTerms
                    .Cast<CountedWordPair>()
                    .OrderByDescending(oPair => oPair.Count)
                    .Select(oPair =>
                        TwitterSearchNetworkWordMetricUtil.FormatWordPair(
                            oPair) ),
                iMaximumTopStrings) );

        // Word pairs, ranked by salience.

        sTopWordPairsInTweetBySalience = String.Join(
            TwitterSearchNetworkWordMetricUtil.WordPairSeparator,
            TwitterSearchNetworkStringUtil.TakeTopStringsAsArray(
                oWordPairCounter.CountedTerms
                    .Cast<CountedWordPair>()
                    .OrderByDescending(oPair => oPair.Salience)
                    .Select(oPair =>
                        TwitterSearchNetworkWordMetricUtil.FormatWordPair(
                            oPair) ),
                iMaximumTopStrings) );
    }
    // Creates the WordCounter and WordPairCounter used for word metrics, each
    // configured with its own list of words to skip.
    //
    // searchTerm:                    search term whose space-separated words
    //                                are added to the single-word skip list.
    //                                May be null or empty, in which case
    //                                nothing is added for it.
    // calculateGraphMetricsContext:  supplies the user's WordsToSkip setting.
    // wordCounter:                   receives a counter that skips the user's
    //                                words, the search term's words and "rt".
    // wordPairCounter:               receives a counter that skips only the
    //                                user's words.
    CreateCountersForWordsAndWordPairs
    (
        String searchTerm,
        CalculateGraphMetricsContext calculateGraphMetricsContext,
        out WordCounter wordCounter,
        out WordPairCounter wordPairCounter
    )
    {
        Debug.Assert(calculateGraphMetricsContext != null);

        // The user-supplied skip list is the common base for both counters.

        String [] asUserWordsToSkip = StringUtil.SplitOnSpaces(
            calculateGraphMetricsContext.GraphMetricUserSettings
            .WordMetricUserSettings.WordsToSkip);

        // Word pairs skip nothing beyond the user-supplied words.

        wordPairCounter = new WordPairCounter(asUserWordsToSkip);

        // Single words additionally skip the words of the search term and
        // "rt".  (The original comment glosses "rt" as "reply to"; on Twitter
        // "RT" conventionally marks a retweet.)

        List<String> oSingleWordsToSkip = new List<String>(asUserWordsToSkip);

        String [] asSearchTermWords = String.IsNullOrEmpty(searchTerm) ?
            new String[0] : StringUtil.SplitOnSpaces(searchTerm);

        oSingleWordsToSkip.AddRange(asSearchTermWords);
        oSingleWordsToSkip.Add("rt");

        wordCounter = new WordCounter( oSingleWordsToSkip.ToArray() );
    }
    // Computes the top words and word pairs for a set of edges and appends
    // one graph metric value per ranking to the corresponding list.
    //
    // edges, statusEdgeColumnName, maximumTopTerms, wordCounter and
    // wordPairCounter are passed straight through to
    // ConcatenateTopWordsAndWordPairs.
    //
    // vertexRowID:  ID stored in each GraphMetricValueWithID that is added.
    //
    // The four list parameters each receive exactly one new entry: words by
    // count, words by salience, word pairs by count, word pairs by salience.
    AddGraphMetricValuesForTopWordsAndWordPairs
    (
        IEnumerable<IEdge> edges,
        String statusEdgeColumnName,
        Int32 maximumTopTerms,
        WordCounter wordCounter,
        WordPairCounter wordPairCounter,
        Int32 vertexRowID,

        List<GraphMetricValueWithID>
            topWordsInTweetByCountGraphMetricValues,

        List<GraphMetricValueWithID>
            topWordsInTweetBySalienceGraphMetricValues,

        List<GraphMetricValueWithID>
            topWordPairsInTweetByCountGraphMetricValues,

        List<GraphMetricValueWithID>
            topWordPairsInTweetBySalienceGraphMetricValues
    )
    {
        Debug.Assert(edges != null);
        Debug.Assert( !String.IsNullOrEmpty(statusEdgeColumnName) );
        Debug.Assert(maximumTopTerms > 0);
        Debug.Assert(wordCounter != null);
        Debug.Assert(wordPairCounter != null);
        Debug.Assert(topWordsInTweetByCountGraphMetricValues != null);
        Debug.Assert(topWordsInTweetBySalienceGraphMetricValues != null);
        Debug.Assert(topWordPairsInTweetByCountGraphMetricValues != null);
        Debug.Assert(topWordPairsInTweetBySalienceGraphMetricValues != null);

        String sWordsByCount;
        String sWordsBySalience;
        String sWordPairsByCount;
        String sWordPairsBySalience;

        ConcatenateTopWordsAndWordPairs(
            edges,
            statusEdgeColumnName,
            maximumTopTerms,
            wordCounter,
            wordPairCounter,
            out sWordsByCount,
            out sWordsBySalience,
            out sWordPairsByCount,
            out sWordPairsBySalience
            );

        // Parallel arrays: element i of asConcatenatedTerms is appended to
        // element i of aoDestinations.

        List<GraphMetricValueWithID> [] aoDestinations =
        {
            topWordsInTweetByCountGraphMetricValues,
            topWordsInTweetBySalienceGraphMetricValues,
            topWordPairsInTweetByCountGraphMetricValues,
            topWordPairsInTweetBySalienceGraphMetricValues,
        };

        String [] asConcatenatedTerms =
        {
            sWordsByCount,
            sWordsBySalience,
            sWordPairsByCount,
            sWordPairsBySalience,
        };

        for (Int32 i = 0; i < aoDestinations.Length; i++)
        {
            // NOTE(review): ExcelUtil.ForceCellText presumably makes Excel
            // treat the value as literal text -- confirm against ExcelUtil.

            aoDestinations[i].Add( new GraphMetricValueWithID(
                vertexRowID,
                ExcelUtil.ForceCellText( asConcatenatedTerms[i] ) ) );
        }
    }
예제 #15
0
        // Tallies words and word pairs across the status text of the given
        // edges and produces four delimited "top terms" strings.
        //
        // Both counters are cleared on entry, fed the non-empty value read
        // from sStatusEdgeColumnName on each edge, and then asked to
        // calculate salience.  The out parameters receive words by count,
        // words by salience, word pairs by count and word pairs by salience,
        // each ranked in descending order and limited through
        // TakeTopStringsAsArray with iMaximumTopStrings.
        ConcatenateTopWordsAndWordPairs
        (
            IEnumerable <IEdge> oEdges,
            String sStatusEdgeColumnName,
            Int32 iMaximumTopStrings,
            WordCounter oWordCounter,
            WordPairCounter oWordPairCounter,
            out String sTopWordsInTweetByCount,
            out String sTopWordsInTweetBySalience,
            out String sTopWordPairsInTweetByCount,
            out String sTopWordPairsInTweetBySalience
        )
        {
            Debug.Assert(oEdges != null);
            Debug.Assert(!String.IsNullOrEmpty(sStatusEdgeColumnName));
            Debug.Assert(iMaximumTopStrings > 0);
            Debug.Assert(oWordCounter != null);
            Debug.Assert(oWordPairCounter != null);

            // Discard whatever the counters accumulated on a previous call.

            oWordCounter.Clear();
            oWordPairCounter.Clear();

            // Each edge with a non-empty status contributes one document to
            // both counters.

            String sTweetText;

            foreach (IEdge oTweetEdge in oEdges)
            {
                Boolean bHasStatus = oTweetEdge.TryGetNonEmptyStringValue(
                    sStatusEdgeColumnName, out sTweetText);

                if (bHasStatus)
                {
                    oWordCounter.CountTermsInDocument(sTweetText);
                    oWordPairCounter.CountTermsInDocument(sTweetText);
                }
            }

            oWordCounter.CalculateSalienceOfCountedTerms();
            oWordPairCounter.CalculateSalienceOfCountedTerms();

            // Words: most frequent first.

            sTopWordsInTweetByCount = String.Join(
                TwitterSearchNetworkWordMetricUtil.WordSeparator,
                TwitterSearchNetworkStringUtil.TakeTopStringsAsArray(
                    oWordCounter.CountedTerms
                        .Cast<CountedWord>()
                        .OrderByDescending(oTerm => oTerm.Count)
                        .Select(oTerm => oTerm.Word),
                    iMaximumTopStrings));

            // Words: most salient first.

            sTopWordsInTweetBySalience = String.Join(
                TwitterSearchNetworkWordMetricUtil.WordSeparator,
                TwitterSearchNetworkStringUtil.TakeTopStringsAsArray(
                    oWordCounter.CountedTerms
                        .Cast<CountedWord>()
                        .OrderByDescending(oTerm => oTerm.Salience)
                        .Select(oTerm => oTerm.Word),
                    iMaximumTopStrings));

            // Word pairs: most frequent first.

            sTopWordPairsInTweetByCount = String.Join(
                TwitterSearchNetworkWordMetricUtil.WordPairSeparator,
                TwitterSearchNetworkStringUtil.TakeTopStringsAsArray(
                    oWordPairCounter.CountedTerms
                        .Cast<CountedWordPair>()
                        .OrderByDescending(oTerm => oTerm.Count)
                        .Select(oTerm =>
                            TwitterSearchNetworkWordMetricUtil.FormatWordPair(
                                oTerm)),
                    iMaximumTopStrings));

            // Word pairs: most salient first.

            sTopWordPairsInTweetBySalience = String.Join(
                TwitterSearchNetworkWordMetricUtil.WordPairSeparator,
                TwitterSearchNetworkStringUtil.TakeTopStringsAsArray(
                    oWordPairCounter.CountedTerms
                        .Cast<CountedWordPair>()
                        .OrderByDescending(oTerm => oTerm.Salience)
                        .Select(oTerm =>
                            TwitterSearchNetworkWordMetricUtil.FormatWordPair(
                                oTerm)),
                    iMaximumTopStrings));
        }