using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

namespace BayesSharp.Tokenizers
{
    /// <summary>
    /// A simple regex-based tokenizer. Every maximal run of word characters
    /// (<c>\w+</c>) in the input string becomes one token, so any non-word
    /// character (whitespace, punctuation, ...) acts as a separator.
    /// </summary>
    // NOTE(review): the generic argument on ITokenizer is reconstructed from the
    // IEnumerable<string> return type of Tokenize; confirm against the interface declaration.
    public class SimpleTextTokenizer : ITokenizer<string>
    {
        // The pattern never changes, so a single shared instance is enough.
        private static readonly Regex WordRe = new Regex(@"\w+");

        private readonly bool _convertToLower;

        // Kept by reference (not copied): changes the caller makes to the list
        // after construction are still honoured by later Tokenize calls.
        private readonly List<string> _ignoreList;

        /// <summary>
        /// Creates a tokenizer that lower-cases tokens and ignores nothing.
        /// </summary>
        public SimpleTextTokenizer() : this(true, null)
        {
        }

        /// <summary>
        /// Creates a tokenizer with explicit case handling and ignore list.
        /// </summary>
        /// <param name="convertToLower">Tokens must be converted to lower case</param>
        /// <param name="ignoreList">Tokens that will be ignored; <c>null</c> disables filtering</param>
        public SimpleTextTokenizer(bool convertToLower, List<string> ignoreList)
        {
            _ignoreList = ignoreList;
            _convertToLower = convertToLower;
        }

        /// <summary>
        /// Breaks a string into tokens.
        /// </summary>
        /// <param name="input">String to be broken</param>
        /// <returns>
        /// A lazily evaluated sequence of tokens in order of appearance, lower-cased
        /// when configured, with tokens present in the ignore list removed.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
        /// <exception cref="FormatException"><paramref name="input"/> is not a <see cref="string"/>.</exception>
        public IEnumerable<string> Tokenize(object input)
        {
            // Validation happens here, eagerly, rather than inside the iterator,
            // so bad arguments fail at the call site instead of on first enumeration.
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            var text = input as string;
            if (text == null)
            {
                throw new FormatException(string.Format("Expected string, given {0}", input.GetType()));
            }

            var tokens = MatchTokens(text);
            if (_ignoreList == null)
            {
                return tokens;
            }

            // The ignore list is compared against the token after case conversion.
            return tokens.Where(token => !_ignoreList.Contains(token));
        }

        /// <summary>
        /// Lazily yields every word match in <paramref name="text"/>, applying the
        /// configured case conversion.
        /// </summary>
        private IEnumerable<string> MatchTokens(string text)
        {
            foreach (Match match in WordRe.Matches(text))
            {
                // Invariant casing keeps tokens identical regardless of the
                // current thread culture (e.g. Turkish dotted/dotless I).
                yield return _convertToLower ? match.Value.ToLowerInvariant() : match.Value;
            }
        }
    }
}