#region Copyright (c) 2004, Ryan Whitaker
/*********************************************************************************
'
' Copyright (c) 2004 Ryan Whitaker
'
' This software is provided 'as-is', without any express or implied warranty. In no
' event will the authors be held liable for any damages arising from the use of this
' software.
'
' Permission is granted to anyone to use this software for any purpose, including
' commercial applications, and to alter it and redistribute it freely, subject to the
' following restrictions:
'
' 1. The origin of this software must not be misrepresented; you must not claim that
' you wrote the original software. If you use this software in a product, an
' acknowledgment (see the following) in the product documentation is required.
'
' This product uses software written by the developers of NClassifier
' (http://nclassifier.sourceforge.net). NClassifier is a .NET port of the Nick
' Lothian's Java text classification engine, Classifier4J
' (http://classifier4j.sourceforge.net).
'
' 2. Altered source versions must be plainly marked as such, and must not be
' misrepresented as being the original software.
'
' 3. This notice may not be removed or altered from any source distribution.
'
'********************************************************************************/
#endregion
using System;
using System.Collections;
using System.Text.RegularExpressions;

namespace NClassifier
{
    public class Utilities
    {
        /// <summary>
        /// Gets an array of sentences.
        /// </summary>
        /// <param name="input">A string that contains sentences.</param>
        /// <returns>
        /// An array of strings, each element containing a sentence without its
        /// terminating punctuation; an empty array when <paramref name="input"/> is null.
        /// </returns>
        public static string[] GetSentences(string input)
        {
            if (input == null)
                return new string[0];

            // Split on one or more ".", "!" or "?" followed by whitespace or end of input.
            // The original Java regex was (\.|!|\?)+(\s|\z); the metacharacters must be
            // escaped, otherwise "." matches any character and "|?" is not a valid pattern.
            string[] result = Regex.Split(input, @"(?:\.|!|\?)+(?:\s+|\z)");

            // Regex.Split yields an empty element when the input ends with a terminator,
            // so keep only the non-empty pieces.
            ArrayList list = new ArrayList();
            foreach (string s in result)
            {
                if (s.Length > 0)
                    list.Add(s);
            }
            return (string[])list.ToArray(typeof(string));
        }
    }
}
Find all unique words in an array of words.
#region Copyright (c) 2004, Ryan Whitaker
/*********************************************************************************
'
' Copyright (c) 2004 Ryan Whitaker
'
' This software is provided 'as-is', without any express or implied warranty. In no
' event will the authors be held liable for any damages arising from the use of this
' software.
'
' Permission is granted to anyone to use this software for any purpose, including
' commercial applications, and to alter it and redistribute it freely, subject to the
' following restrictions:
'
' 1. The origin of this software must not be misrepresented; you must not claim that
' you wrote the original software. If you use this software in a product, an
' acknowledgment (see the following) in the product documentation is required.
'
' This product uses software written by the developers of NClassifier
' (http://nclassifier.sourceforge.net). NClassifier is a .NET port of the Nick
' Lothian's Java text classification engine, Classifier4J
' (http://classifier4j.sourceforge.net).
'
' 2. Altered source versions must be plainly marked as such, and must not be
' misrepresented as being the original software.
'
' 3. This notice may not be removed or altered from any source distribution.
'
'********************************************************************************/
#endregion
using System;
using System.Collections;
using System.Text.RegularExpressions;
namespace NClassifier
{
public class Utilities
{
    /// <summary>
    /// Finds all unique words in an array of words.
    /// </summary>
    /// <param name="input">An array of strings.</param>
    /// <returns>
    /// The distinct entries of <paramref name="input"/> in order of first appearance;
    /// an empty array when <paramref name="input"/> is null.
    /// </returns>
    public static string[] GetUniqueWords(string[] input)
    {
        // A missing array is treated as "no words".
        if (input == null)
            return new string[0];

        ArrayList unique = new ArrayList();
        foreach (string word in input)
        {
            // Skip anything that has already been collected.
            if (unique.Contains(word))
                continue;
            unique.Add(word);
        }
        return (string[])unique.ToArray(typeof(string));
    }
}
}
[/csharp]
Count how many times a word appears in an array of words.
#region Copyright (c) 2004, Ryan Whitaker
/*********************************************************************************
'
' Copyright (c) 2004 Ryan Whitaker
'
' This software is provided 'as-is', without any express or implied warranty. In no
' event will the authors be held liable for any damages arising from the use of this
' software.
'
' Permission is granted to anyone to use this software for any purpose, including
' commercial applications, and to alter it and redistribute it freely, subject to the
' following restrictions:
'
' 1. The origin of this software must not be misrepresented; you must not claim that
' you wrote the original software. If you use this software in a product, an
' acknowledgment (see the following) in the product documentation is required.
'
' This product uses software written by the developers of NClassifier
' (http://nclassifier.sourceforge.net). NClassifier is a .NET port of the Nick
' Lothian's Java text classification engine, Classifier4J
' (http://classifier4j.sourceforge.net).
'
' 2. Altered source versions must be plainly marked as such, and must not be
' misrepresented as being the original software.
'
' 3. This notice may not be removed or altered from any source distribution.
'
'********************************************************************************/
#endregion
using System;
using System.Collections;
using System.Text.RegularExpressions;
namespace NClassifier
{
    public class Utilities
    {
        /// <summary>
        /// Counts how many times a word appears in an array of words.
        /// </summary>
        /// <param name="word">The word to count.</param>
        /// <param name="words">
        /// A non-null array of words. The lookup uses <see cref="Array.BinarySearch(Array, object)"/>,
        /// so the array is expected to be sorted.
        /// </param>
        /// <returns>
        /// The length of the run of adjacent entries equal to <paramref name="word"/>,
        /// or 0 when the binary search does not find the word.
        /// </returns>
        public static int CountWords(string word, string[] words)
        {
            // Find the index of one of the matching items in the array
            // (negative when the word is not present).
            int itemIndex = Array.BinarySearch(words, word);

            // Iterate backwards until we are at, or one item before, the first match.
            while (itemIndex > 0 && words[itemIndex] == word)
            {
                itemIndex--;
            }

            // Walk forward over the run of matches. A negative itemIndex
            // (word not found) skips the loop entirely and yields 0.
            int count = 0;
            while (itemIndex < words.Length && itemIndex >= 0)
            {
                if (words[itemIndex] == word)
                {
                    count++;
                }

                itemIndex++;

                // Stop as soon as the run of matching words ends.
                if (itemIndex < words.Length && words[itemIndex] != word)
                {
                    break;
                }
            }
            return count;
        }
    }
}
Helper class that splits an overly long word into several shorter ones.
/*
* Author: Kishore Reddy
* Url: http://commonlibrarynet.codeplex.com/
* Title: CommonLibrary.NET
* Copyright: © 2009 Kishore Reddy
* License: LGPL License
* LicenseUrl: http://commonlibrarynet.codeplex.com/license
* Description: A C# based .NET 3.5 Open-Source collection of reusable components.
* Usage: Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an “AS IS” BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Text;
namespace GenericCode
{
/// <summary>
/// Splits over-long words in a piece of text.
/// Alternative to possibly using Regular expression.
/// </summary>
public class TextSplitter
{
    /// <summary>
    /// Determines into how many pieces a word has to be cut so that no piece is
    /// longer than <paramref name="maxCharsInWord"/>.
    /// </summary>
    /// <param name="wordLength">Length of the word.</param>
    /// <param name="maxCharsInWord">Maximum number of characters allowed in one piece.</param>
    /// <returns>
    /// 0 when the word already fits or the limit is not positive; otherwise the
    /// number of pieces (the word length divided by the limit, rounded up).
    /// </returns>
    internal static int GetNumberOfTimesToSplit(int wordLength, int maxCharsInWord)
    {
        // A non-positive limit cannot be honoured (0 would divide by zero below),
        // so it is treated as "nothing to split".
        if (maxCharsInWord <= 0 || wordLength <= maxCharsInWord) return 0;

        int splitCount = wordLength / maxCharsInWord;
        if (wordLength % maxCharsInWord > 0) splitCount++;
        return splitCount;
    }

    /// <summary>
    /// Cuts a word into pieces of at most <paramref name="charsPerWord"/> characters
    /// and joins the pieces with <paramref name="spacer"/>.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <param name="charsPerWord">Maximum characters in each piece, e.g. 40.</param>
    /// <param name="spacer">Text inserted between the pieces, e.g. " ".</param>
    /// <returns>
    /// The pieces joined by the spacer, or <paramref name="text"/> unchanged when it
    /// is null, empty, or needs no splitting.
    /// </returns>
    internal static string SplitWord(string text, int charsPerWord, string spacer)
    {
        if (string.IsNullOrEmpty(text)) return text;

        // Determine how many pieces are needed; 0 means the word already fits.
        int splitCount = GetNumberOfTimesToSplit(text.Length, charsPerWord);
        if (splitCount == 0) return text;

        // Use a buffer instead of string concatenation.
        StringBuilder buffer = new StringBuilder(text.Length + splitCount * spacerLength(spacer));
        int currentPosition = 0;
        for (int count = 1; count <= splitCount; count++)
        {
            // The last piece takes whatever is left over.
            string word = (count < splitCount)
                ? text.Substring(currentPosition, charsPerWord)
                : text.Substring(currentPosition);
            buffer.Append(word);

            // No spacer after the last piece, so the supplied text stays the same
            // except for the splitting.
            if (count < splitCount) buffer.Append(spacer);

            currentPosition += charsPerWord;
        }
        return buffer.ToString();
    }

    // Length of the spacer for pre-sizing the buffer; a null spacer appends nothing.
    private static int spacerLength(string spacer)
    {
        return spacer == null ? 0 : spacer.Length;
    }

    /// <summary>
    /// Checks the text for words that exceed the maximum allowed length.
    /// If found, splits the word by inserting spaces.
    /// </summary>
    /// <param name="text">The text to check; words are separated by a space or a newline.</param>
    /// <param name="maxCharsInWord">Maximum number of characters allowed in a single word.</param>
    /// <returns>
    /// The text with every over-long word broken up by spaces; the original spacers
    /// between words are kept. Null or empty text is returned unchanged.
    /// </returns>
    public static string CheckAndSplitText(string text, int maxCharsInWord)
    {
        if (string.IsNullOrEmpty(text)) return text;

        bool isSpacerNewLine = false;
        int currentPosition = 0;
        int ndxSpace = GetIndexOfSpacer(text, currentPosition, ref isSpacerNewLine);

        // Case 1: Single long word.
        if (ndxSpace < 0 && text.Length > maxCharsInWord) return SplitWord(text, maxCharsInWord, " ");

        StringBuilder buffer = new StringBuilder();

        // Go through every spacer-terminated word. The spacer may sit at index 0
        // (text starting with a space or newline), hence ">= 0".
        while (currentPosition < text.Length && ndxSpace >= 0)
        {
            int wordLength = ndxSpace - currentPosition;
            string currentWord = text.Substring(currentPosition, wordLength);
            string spacer = isSpacerNewLine ? Environment.NewLine : " ";

            // SplitWord hands the word back untouched when it already fits.
            buffer.Append(SplitWord(currentWord, maxCharsInWord, " "));
            buffer.Append(spacer);

            // Skip exactly the spacer that was found; the newline sequence is
            // one or two characters depending on the platform.
            currentPosition = ndxSpace + (isSpacerNewLine ? Environment.NewLine.Length : 1);
            ndxSpace = GetIndexOfSpacer(text, currentPosition, ref isSpacerNewLine);
        }

        // Final check: no further spacer, so whatever is left is the last word.
        if (currentPosition < text.Length && ndxSpace < 0)
        {
            string lastWord = text.Substring(currentPosition);
            buffer.Append(SplitWord(lastWord, maxCharsInWord, " "));
        }
        return buffer.ToString();
    }

    /// <summary>
    /// Gets the index of the first spacer (a space " " or a newline) at or after
    /// <paramref name="currentPosition"/>.
    /// </summary>
    /// <param name="txt">The text to search.</param>
    /// <param name="currentPosition">Index at which the search starts.</param>
    /// <param name="isNewLine">Set to true when the spacer found is a newline, false otherwise.</param>
    /// <returns>The index of the spacer, or -1 when there is neither a space nor a newline.</returns>
    public static int GetIndexOfSpacer(string txt, int currentPosition, ref bool isNewLine)
    {
        // Take the first spacer that is found. It could be either a space or a
        // newline: if the space comes before the newline take the space,
        // otherwise the newline. Ordinal search keeps the result independent of
        // the current culture.
        int ndxSpace = txt.IndexOf(' ', currentPosition);
        int ndxNewLine = txt.IndexOf(Environment.NewLine, currentPosition, StringComparison.Ordinal);
        bool hasSpace = ndxSpace > -1;
        bool hasNewLine = ndxNewLine > -1;

        // The newline wins when it is the only spacer or comes before the space.
        isNewLine = hasNewLine && (!hasSpace || ndxNewLine < ndxSpace);

        // ndxSpace is already -1 when no spacer exists at all.
        return isNewLine ? ndxNewLine : ndxSpace;
    }
}
}
[/csharp]
Get the index of a spacer (a space " " or a newline).
/*
* Author: Kishore Reddy
* Url: http://commonlibrarynet.codeplex.com/
* Title: CommonLibrary.NET
* Copyright: © 2009 Kishore Reddy
* License: LGPL License
* LicenseUrl: http://commonlibrarynet.codeplex.com/license
* Description: A C# based .NET 3.5 Open-Source collection of reusable components.
* Usage: Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an “AS IS” BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Text;
namespace GenericCode
{
public class StringHelpers
{
    /// <summary>
    /// Gets the index of the first spacer (a space " " or a newline) at or after
    /// <paramref name="currentPosition"/>.
    /// </summary>
    /// <param name="txt">The text to search.</param>
    /// <param name="currentPosition">Index at which the search starts.</param>
    /// <param name="isNewLine">Set to true when the spacer found is a newline, false otherwise.</param>
    /// <returns>The index of the spacer, or -1 when there is neither a space nor a newline.</returns>
    public static int GetIndexOfSpacer(string txt, int currentPosition, ref bool isNewLine)
    {
        // Take the first spacer that is found. It could be either a space or a
        // newline: if the space comes before the newline take the space,
        // otherwise the newline. Ordinal search keeps the result independent of
        // the current culture.
        int ndxSpace = txt.IndexOf(' ', currentPosition);
        int ndxNewLine = txt.IndexOf(Environment.NewLine, currentPosition, StringComparison.Ordinal);
        bool hasSpace = ndxSpace > -1;
        bool hasNewLine = ndxNewLine > -1;
        isNewLine = false;

        // Found both space and newline: the earlier one wins.
        if (hasSpace && hasNewLine)
        {
            if (ndxSpace < ndxNewLine) { return ndxSpace; }
            isNewLine = true;
            return ndxNewLine;
        }

        // Found space only.
        if (hasSpace) { return ndxSpace; }

        // Found newline only.
        if (hasNewLine) { isNewLine = true; return ndxNewLine; }

        // No space or newline.
        return -1;
    }
}
}
[/csharp]
Convert the word(s) in the sentence to sentence case.
/*
* Author: Kishore Reddy
* Url: http://commonlibrarynet.codeplex.com/
* Title: CommonLibrary.NET
* Copyright: © 2009 Kishore Reddy
* License: LGPL License
* LicenseUrl: http://commonlibrarynet.codeplex.com/license
* Description: A C# based .NET 3.5 Open-Source collection of reusable components.
* Usage: Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an “AS IS” BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Text;
namespace GenericCode
{
public class StringHelpers
{
    /// <summary>
    /// Converts the word(s) in the sentence to sentence case.
    /// UPPER = Upper
    /// lower = Lower
    /// MiXEd = Mixed
    /// </summary>
    /// <param name="s">The text to convert; surrounding whitespace is trimmed first.</param>
    /// <param name="delimiter">The character that separates the words.</param>
    /// <returns>
    /// The text with every word lower-cased and its first character upper-cased.
    /// Empty words (consecutive delimiters) are kept as they are, and trailing
    /// delimiters are removed. Null or empty input is returned unchanged.
    /// </returns>
    public static string ConvertToSentanceCase(string s, char delimiter)
    {
        // Check null/empty, before and after trimming.
        if (string.IsNullOrEmpty(s))
            return s;
        s = s.Trim();
        if (string.IsNullOrEmpty(s))
            return s;

        // Only 1 token.
        if (s.IndexOf(delimiter) < 0)
            return CapitalizeToken(s);

        // More than 1 token.
        string[] tokens = s.Split(delimiter);
        StringBuilder buffer = new StringBuilder(s.Length + 1);
        foreach (string token in tokens)
        {
            buffer.Append(CapitalizeToken(token));
            buffer.Append(delimiter);
        }

        // Every token was followed by a delimiter; drop the trailing one(s).
        return buffer.ToString().TrimEnd(delimiter);
    }

    // Lower-cases a token and upper-cases its first character. An empty token
    // (produced by consecutive delimiters) has no first character and is
    // returned unchanged instead of being indexed.
    private static string CapitalizeToken(string token)
    {
        if (token.Length == 0)
            return token;
        string lowered = token.ToLower();
        return lowered[0].ToString().ToUpper() + lowered.Substring(1);
    }
}
}
[/csharp]
Returns the defaultval if the val string is null or empty.
/*
 * Author: Kishore Reddy
 * Url: http://commonlibrarynet.codeplex.com/
 * Title: CommonLibrary.NET
 * Copyright: © 2009 Kishore Reddy
 * License: LGPL License
 * LicenseUrl: http://commonlibrarynet.codeplex.com/license
 * Description: A C# based .NET 3.5 Open-Source collection of reusable components.
 * Usage: Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using System.Collections.Generic;
using System.Text;

namespace GenericCode
{
    public class StringHelpers
    {
        /// <summary>
        /// Returns the defaultval if the val string is null or empty.
        /// Returns the val string otherwise.
        /// </summary>
        /// <param name="val">The string to check.</param>
        /// <param name="defaultVal">The value to fall back to when <paramref name="val"/> is null or empty.</param>
        /// <returns><paramref name="val"/> when it has content; otherwise <paramref name="defaultVal"/>.</returns>
        public static string GetDefaultStringIfEmpty(string val, string defaultVal)
        {
            return string.IsNullOrEmpty(val) ? defaultVal : val;
        }
    }
}