using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace BayesSharp.Tokenizers
{
/// <summary>
/// A simple regex-based text tokenizer. Every maximal run of regex word
/// characters (<c>\w+</c>) in the input becomes one token, so whitespace and
/// punctuation both act as separators.
/// </summary>
public class SimpleTextTokenizer : ITokenizer
{
    // NOTE(review): the base type is kept exactly as it appears in this file.
    // If the project's ITokenizer is generic, its type argument must be
    // supplied here as well - confirm against the interface declaration.

    // Shared by all instances: the pattern never changes, so there is no
    // reason to build a new Regex for every tokenizer.
    private static readonly Regex WordRegex = new Regex(@"\w+");

    private readonly bool _convertToLower;

    // May be null, which means "do not filter anything". The list is stored by
    // reference (not copied), so later changes made by the caller are seen here.
    private readonly List<string> _ignoreList;

    /// <summary>
    /// Creates a tokenizer that lower-cases tokens and ignores nothing.
    /// </summary>
    public SimpleTextTokenizer() : this(true, null)
    {
    }

    /// <summary>
    /// Creates a tokenizer with explicit case handling and an optional ignore list.
    /// </summary>
    /// <param name="convertToLower">
    /// When <c>true</c>, each token is converted to lower case before it is returned.
    /// </param>
    /// <param name="ignoreList">
    /// Tokens to drop from the output, or <c>null</c> to keep every token. Entries
    /// are compared with the token after case conversion, so they should be lower
    /// case when <paramref name="convertToLower"/> is <c>true</c>.
    /// </param>
    public SimpleTextTokenizer(bool convertToLower, List<string> ignoreList)
    {
        _ignoreList = ignoreList;
        _convertToLower = convertToLower;
    }

    /// <summary>
    /// Breaks a string into word tokens.
    /// </summary>
    /// <param name="input">String to be broken; must be a <see cref="string"/>.</param>
    /// <returns>
    /// The tokens in order of appearance, minus any that occur in the ignore list.
    /// The sequence is evaluated lazily, as it is enumerated.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
    /// <exception cref="FormatException"><paramref name="input"/> is not a string.</exception>
    public IEnumerable<string> Tokenize(object input)
    {
        // Validation runs eagerly because this method is not itself an iterator.
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        var text = input as string;
        if (text == null)
        {
            throw new FormatException(string.Format("Expected string, given {0}", input.GetType()));
        }

        var tokens = MatchTokens(text);
        if (_ignoreList == null)
        {
            return tokens;
        }

        return tokens.Where(token => !_ignoreList.Contains(token));
    }

    // Lazily yields every \w+ match in the text, lower-cased when requested.
    private IEnumerable<string> MatchTokens(string text)
    {
        foreach (Match match in WordRegex.Matches(text))
        {
            // Invariant casing keeps tokens identical regardless of the
            // current thread culture (e.g. Turkish dotless i).
            yield return _convertToLower ? match.Value.ToLowerInvariant() : match.Value;
        }
    }
}
}