// Word-count sample: splits every input line on spaces, counts how often each
// distinct word occurs, and emits one "word: count" record per word.
// With the 'local' symbol defined the query runs over a single in-memory line and
// the results are printed to the console; otherwise the input is read from Azure
// storage and the results are written back to it.
public static void WordCountExample()
{
#if local
    // Per the sample's own notes, this constructor overload runs the computation
    // on the local computer using a single worker.
    var config = new DryadLinqContext(1);

    // FromEnumerable turns any IEnumerable source into a query input.
    var sampleLines = new LineRecord[] { new LineRecord("This is a dummy line for a short job") };
    var input = config.FromEnumerable(sampleLines);
#else
    string clusterName = "Replace with your HDInsight 3.0 cluster name";

    // To use the davinci.txt example input below, pick the cluster's default storage
    // account and container, which automatically include the sample text.
    string accountName = "Replace with a storage account name";
    string containerName = "Replace with a storage container name";

    // Passing a cluster name creates an Azure-based computation.
    var config = new DryadLinqContext(clusterName);
    config.JobFriendlyName = "DryadLINQ Sample Wordcount";

    // Plain text files are read as records of type LineRecord.
    var inputUri = AzureUtils.ToAzureUri(accountName, containerName, "example/data/gutenberg/davinci.txt");
    var input = config.FromStore<LineRecord>(inputUri);
#endif

    // Break lines into words, bucket identical words together, and pair each word
    // with the size of its bucket.
    var wordCounts = input
        .SelectMany(lineRecord => lineRecord.Line.Split(' '))
        .GroupBy(word => word)
        .Select(wordGroup => new KeyValuePair<string, int>(wordGroup.Key, wordGroup.Count()));

    // Render every (word, count) pair as a text record.
    var toOutput = wordCounts.Select(pair => new LineRecord(String.Format("{0}: {1}", pair.Key, pair.Value)));

#if local
    // Any collection computed by the query can be materialized back at the client,
    // not only the final one. For large collections this is expensive!
    foreach (LineRecord resultLine in toOutput)
    {
        Console.WriteLine(resultLine.Line);
    }
#else
    // The 'true' argument to ToStore means the output is overwritten when the job
    // is run more than once.
    var outputUri = AzureUtils.ToAzureUri(accountName, containerName, "wc-out.txt");
    var info = toOutput.ToStore(outputUri, true).SubmitAndWait();
#endif
}
// Helper for IncrementalMeasureLinesAfterInsert, IncrementalMeasureLinesAfterDelete.
// Formats lines until we hit a synchronization point, a position where we know
// following lines could not be affected by the change.
//
// range           - the dirty range being processed; its StartIndex, PositionsAdded
//                   and PositionsRemoved are read.
// constraintWidth - width passed (twice) to TextBoxLine.Format.
// lineProperties  - paragraph properties passed to TextBoxLine.Format.
// line            - scratch line used for formatting; it is wrapped in a using
//                   block on every pass and then formatted again on the next one.
//                   NOTE(review): this relies on TextBoxLine being re-formattable
//                   after Dispose -- confirm against TextBoxLine.
// endOfParagraph  - true if the caller already formatted through the end of the
//                   paragraph; the loop is skipped in that case.
// lineIndex       - index into _lineMetrics of the first record to examine.
// lineOffset      - character offset at which the next line starts.
private void SyncLineMetrics(DirtyTextRange range, double constraintWidth, LineProperties lineProperties, TextBoxLine line, bool endOfParagraph, int lineIndex, int lineOffset)
{
    // Matching on offsets alone is trusted from the start only when the change
    // added no positions or removed no positions.
    bool offsetSyncOk = (range.PositionsAdded == 0 || range.PositionsRemoved == 0);

    // Offset just past the larger of the added/removed spans, measured from the
    // start of the change.
    int lastCoveredCharOffset = range.StartIndex + Math.Max(range.PositionsAdded, range.PositionsRemoved);

    // Keep updating lines until we find a synchronized position: offset syncing is
    // allowed and the next line starts exactly where an existing record starts.
    // The Count check comes first so the indexer is never evaluated out of range.
    while (!endOfParagraph &&
           (lineIndex == _lineMetrics.Count ||
            !offsetSyncOk ||
            lineOffset != _lineMetrics[lineIndex].Offset))
    {
        if (lineIndex < _lineMetrics.Count &&
            lineOffset >= _lineMetrics[lineIndex].EndOffset)
        {
            // If the current line offset starts at or past the end of the current
            // line metric, remove the metric.  This happens when the previous line
            // frees up enough space to completely consume the following line.
            // We can't simply replace the record without potentially missing our
            // sync position.
            // lineIndex is intentionally not advanced: the next record slides into
            // this slot and is examined on the next pass.
            _lineMetrics.RemoveAt(lineIndex);
            RemoveLineVisualRange(lineIndex, 1);
        }
        else
        {
            using (line)
            {
                line.Format(lineOffset, constraintWidth, constraintWidth, lineProperties, _cache.TextRunCache, _cache.TextFormatter);

                LineRecord record = new LineRecord(lineOffset, line);

                if (lineIndex == _lineMetrics.Count ||
                    lineOffset + line.Length <= _lineMetrics[lineIndex].Offset)
                {
                    // The new line precedes the old line (or there is no old line
                    // left), so insert a new record.
                    _lineMetrics.Insert(lineIndex, record);
                    AddLineVisualPlaceholder(lineIndex);
                }
                else
                {
                    // We expect to be colliding with the old line directly.
                    // If we extend past it, we're in danger of needlessly
                    // re-formatting the entire doc (ie, we miss the real
                    // sync position and don't stop until EndOfParagraph).
                    Invariant.Assert(lineOffset < _lineMetrics[lineIndex].EndOffset);

                    _lineMetrics[lineIndex] = record;
                    ClearLineVisual(lineIndex);

                    // If this line ends past the invalidated region, and it
                    // has a hard line break, it's safe to synchronize on the next
                    // line metric with a matching start offset.
                    // Once set, the flag stays set for the rest of the loop.
                    offsetSyncOk |= lastCoveredCharOffset <= record.EndOffset && line.HasLineBreak;
                }

                // Advance past the line just formatted.
                lineIndex++;
                lineOffset += line.Length;
                endOfParagraph = line.EndOfParagraph;
            }
        }
    }

    // Remove any trailing lines that got absorbed into the new last line.
    if (endOfParagraph && lineIndex < _lineMetrics.Count)
    {
        int count = _lineMetrics.Count - lineIndex;
        _lineMetrics.RemoveRange(lineIndex, count);
        RemoveLineVisualRange(lineIndex, count);
    }
}
// Helper for IncrementalMeasureLinesAfterInsert, IncrementalMeasureLinesAfterDelete.
// Re-formats the line that precedes the first directly affected line after a
// TextContainer change.  That line may grow, because it can absorb content from
// the line that follows it.
//
// lineIndex      - index in _lineMetrics of the line to re-format.
// lineOffset     - on return, the offset immediately after the re-formatted line.
// endOfParagraph - on return, whether the re-formatted line ends the paragraph.
private void FormatFirstIncrementalLine(int lineIndex, double constraintWidth, LineProperties lineProperties, TextBoxLine line,
                                        out int lineOffset, out bool endOfParagraph)
{
    // Capture the old extent so we can tell afterwards whether anything moved.
    LineRecord staleRecord = _lineMetrics[lineIndex];
    int staleEndOffset = staleRecord.EndOffset;

    // The line keeps its starting offset; only its length can change.
    lineOffset = staleRecord.Offset;

    using (line)
    {
        line.Format(lineOffset, constraintWidth, constraintWidth, lineProperties, _cache.TextRunCache, _cache.TextFormatter);
        _lineMetrics[lineIndex] = new LineRecord(lineOffset, line);

        endOfParagraph = line.EndOfParagraph;
        lineOffset += line.Length;
    }

    // Keep the cached Visual when the line still ends where it used to.
    bool extentChanged = _lineMetrics[lineIndex].EndOffset != staleEndOffset;
    if (extentChanged)
    {
        ClearLineVisual(lineIndex);
    }
}
// Measures content invalidated by a TextContainer change whose net effect is a
// removal (more positions removed than added).  Updates _lineMetrics and the
// cached line visuals, then recomputes desiredSize.
private void IncrementalMeasureLinesAfterDelete(double constraintWidth, LineProperties lineProperties, DirtyTextRange range, ref Size desiredSize)
{
    int delta = range.PositionsAdded - range.PositionsRemoved;
    Invariant.Assert(delta < 0);
    int removedSpan = -delta;

    int firstLineIndex = GetLineIndexFromOffset(range.StartIndex);

    // Clip the affected region to the part of the document that has already been
    // measured.  Clipping happens when an incremental update arrives before
    // background layout has completed.
    int endOffset = range.StartIndex + removedSpan - 1;
    int lastMeasuredOffset = _lineMetrics[_lineMetrics.Count - 1].EndOffset;
    if (endOffset > lastMeasuredOffset)
    {
        Invariant.Assert(this.IsBackgroundLayoutPending);
        endOffset = lastMeasuredOffset;

        if (range.StartIndex == endOffset)
        {
            // Nothing left to do until background layout runs.
            return;
        }
    }

    int lastLineIndex = GetLineIndexFromOffset(endOffset);

    // Shift the offsets of every line after the affected region by the (negative)
    // delta.
    for (int following = lastLineIndex + 1; following < _lineMetrics.Count; following++)
    {
        _lineMetrics[following].Offset += delta;
    }

    TextBoxLine line = new TextBoxLine(this);
    int lineIndex = firstLineIndex;
    int lineOffset;
    bool endOfParagraph;

    // The line before the first directly affected one must be re-formatted too:
    // after the change the affected line may have become short enough to merge
    // into its predecessor.
    if (lineIndex <= 0)
    {
        // No predecessor; start from the affected line itself.
        lineOffset = _lineMetrics[lineIndex].Offset;
        endOfParagraph = false;
    }
    else
    {
        FormatFirstIncrementalLine(lineIndex - 1, constraintWidth, lineProperties, line, out lineOffset, out endOfParagraph);
    }

    // Decide whether the first affected line survives.  If the removal covers it
    // completely it is dropped together with the other covered lines below.
    bool reformatFirstLine =
        !endOfParagraph &&
        (range.StartIndex > lineOffset || range.StartIndex + removedSpan < _lineMetrics[lineIndex].EndOffset);

    if (reformatFirstLine)
    {
        // Only part of the line is covered, so reformat it in place.
        using (line)
        {
            line.Format(lineOffset, constraintWidth, constraintWidth, lineProperties, _cache.TextRunCache, _cache.TextFormatter);
            _lineMetrics[lineIndex] = new LineRecord(lineOffset, line);

            lineOffset += line.Length;
            endOfParagraph = line.EndOfParagraph;
        }

        ClearLineVisual(lineIndex);
        lineIndex++;
    }

    // Drop every remaining line that the removal covers completely.
    int coveredLineCount = lastLineIndex - lineIndex + 1;
    _lineMetrics.RemoveRange(lineIndex, coveredLineCount);
    RemoveLineVisualRange(lineIndex, coveredLineCount);

    // Re-measure the lines that follow, as far as needed to resynchronize.
    SyncLineMetrics(range, constraintWidth, lineProperties, line, endOfParagraph, lineIndex, lineOffset);

    desiredSize = BruteForceCalculateDesiredSize();
}
// Measures content invalidated by a TextContainer change whose net effect is an
// insertion or a same-size replacement (positions added >= positions removed).
// Updates _lineMetrics and the cached line visuals, then recomputes desiredSize.
private void IncrementalMeasureLinesAfterInsert(double constraintWidth, LineProperties lineProperties, DirtyTextRange range, ref Size desiredSize)
{
    int netGrowth = range.PositionsAdded - range.PositionsRemoved;
    Invariant.Assert(netGrowth >= 0);

    int lineIndex = GetLineIndexFromOffset(range.StartIndex, LogicalDirection.Forward);

    if (netGrowth > 0)
    {
        // Push the offsets of all following lines forward by the inserted amount.
        int following = lineIndex + 1;
        while (following < _lineMetrics.Count)
        {
            _lineMetrics[following].Offset += netGrowth;
            following++;
        }
    }

    TextBoxLine scratchLine = new TextBoxLine(this);
    int lineOffset;
    bool endOfParagraph;

    // The previous line has to be re-formatted as well: if a hard break was
    // inserted, the first directly affected line may now be shorter and able to
    // merge into its predecessor.
    if (lineIndex <= 0)
    {
        // No predecessor; start from the affected line itself.
        lineOffset = _lineMetrics[lineIndex].Offset;
        endOfParagraph = false;
    }
    else
    {
        FormatFirstIncrementalLine(lineIndex - 1, constraintWidth, lineProperties, scratchLine, out lineOffset, out endOfParagraph);
    }

    // Format the line directly affected by the change.  When endOfParagraph is
    // already true the line was absorbed into its predecessor (because its new
    // content is thinner, or because the TextWrapping property changed), so there
    // is nothing to format here.
    if (!endOfParagraph)
    {
        using (scratchLine)
        {
            scratchLine.Format(lineOffset, constraintWidth, constraintWidth, lineProperties, _cache.TextRunCache, _cache.TextFormatter);
            _lineMetrics[lineIndex] = new LineRecord(lineOffset, scratchLine);

            lineOffset += scratchLine.Length;
            endOfParagraph = scratchLine.EndOfParagraph;
        }

        ClearLineVisual(lineIndex);
        lineIndex++;
    }

    // Re-measure the lines that follow, as far as needed to resynchronize.
    SyncLineMetrics(range, constraintWidth, lineProperties, scratchLine, endOfParagraph, lineIndex, lineOffset);

    desiredSize = BruteForceCalculateDesiredSize();
}
// Word-count sample: splits every input line on spaces, counts how often each
// distinct word occurs, and emits one "word: count" record per word.
//
// The target is chosen with compilation symbols, in this order of precedence:
//   local  - run over a single in-memory line; print results to the console.
//   azure  - read the input from Azure storage and write the results back to it.
//   (none) - run on a YARN cluster over a single in-memory line; print results to
//            the console.
// 'local' wins when both 'local' and 'azure' are defined, for the input AND the
// output stage, so the two stages can never disagree about which variables exist.
public static void WordCountExample()
{
#if local
    // This overload runs the computation on your local computer using a single worker
    var config = new DryadLinqContext(1);

    var lines = new LineRecord[] { new LineRecord("This is a dummy line for a short job") };
    // You can create inputs from any IEnumerable source using this method
    var input = config.FromEnumerable(lines);
#elif azure
    string clusterName = "Replace with your HDInsight 3.1 cluster name";
    // to use the davinci.txt example input below, select your cluster's default
    // storage account and container, which automatically includes the sample text
    string accountName = "Replace with a storage account name";
    string containerName = "Replace with a storage container name";

    // This overload creates an Azure-based computation
    var config = new DryadLinqContext(clusterName);
    config.JobFriendlyName = "DryadLINQ Sample Wordcount";

    // plain text files should be read as type LineRecord
    var input = config.FromStore<LineRecord>(Utils.ToAzureUri(accountName, containerName,
                                                              "example/data/gutenberg/davinci.txt"));
#else
    // to use a yarn cluster, fill in the username, resource node machine name and port,
    // and name node and hdfs port below (use -1 for the default hdfs port).
    string user = "Replace with your username";
    string resourceNode = "Replace with the name of the computer your resource node is running on";
    int rmPort = 8088;
    string nameNode = "Replace with the name of the computer your name node is running on";
    int hdfsPort = -1;
    // set the YARN queue to submit your job on below. Leave null to use the default queue
    string queue = null;
    // set the number of worker containers to start for the DryadLINQ job below
    int numberOfWorkers = 2;
    // set the amount of memory requested for the DryadLINQ job manager container below:
    // 8GB should be enough for even the largest jobs, and 2GB will normally suffice
    int amMemoryMB = 2000;
    // set the amount of memory requested for the DryadLINQ worker containers below.
    // The amount needed will depend on the code you are running
    int workerMemoryMB = 8000;

    // This overload creates a computation that runs on the YARN cluster described above
    var cluster = new DryadLinqYarnCluster(user, numberOfWorkers, amMemoryMB, workerMemoryMB, queue,
                                           resourceNode, rmPort, nameNode, hdfsPort);
    var config = new DryadLinqContext(cluster);

    var lines = new LineRecord[] { new LineRecord("This is a dummy line for a short job") };
    // You can create inputs from any IEnumerable source using this method
    var input = config.FromEnumerable(lines);
#endif

    var words = input.SelectMany(x => x.Line.Split(' '));
    var groups = words.GroupBy(x => x);
    var counts = groups.Select(x => new KeyValuePair<string, int>(x.Key, x.Count()));
    var toOutput = counts.Select(x => new LineRecord(String.Format("{0}: {1}", x.Key, x.Value)));

    // The output stage must use the same precedence as the input stage above:
    // accountName and containerName exist only when the azure input branch was
    // compiled, i.e. when 'azure' is defined and 'local' is not.
#if azure && !local
    // the 'true' parameter to ToStore means the output will be over-written if you run
    // the job more than once
    var info = toOutput.ToStore(Utils.ToAzureUri(accountName, containerName,
                                                 "wc-out.txt"), true).SubmitAndWait();
#else
    // any collection computed by the query can be materialized back at the client,
    // not just the 'output' collection. For large collections this is expensive!
    foreach (LineRecord line in toOutput)
    {
        Console.WriteLine(line.Line);
    }
#endif
}