diasks2 / pragmatic_tokenizer

A multilingual tokenizer to split a string into tokens
MIT License
90 stars 11 forks source link

more specs #22

Closed maia closed 8 years ago

maia commented 8 years ago

Here are 10 more specs that probably should pass.

I've added number three twice: in my text editor I don't see a flag but two characters — first the letter A in a square, then the letter T in a square. I'm unsure if GitHub does some magic, therefore there's also 3b.

#1
it 'does not clean mentions' do
  # Mentions with leading/trailing underscores must survive clean: true
  # when mentions: :keep_original is set.
  tokenizer = PragmaticTokenizer::Tokenizer.new(
    "@_someone_ because @someone and @_someone was taken",
    mentions: :keep_original,
    clean:    true
  )
  expect(tokenizer.tokenize).to eq(%w[@_someone_ because @someone and @_someone was taken])
end

#2
it 'cleans (r) and (c)' do
  # The registered-trademark and copyright signs should be stripped from
  # the tokens they are attached to when clean: true.
  tokenizer = PragmaticTokenizer::Tokenizer.new("the oscar® night ©companyname", clean: true)
  expect(tokenizer.tokenize).to eq(%w[the oscar night companyname])
end

#3a
it 'cleans letters in boxes 1' do
  # A regional-indicator (flag) sequence embedded inside a word should be
  # removed and act as a token boundary when clean: true.
  tokenizer = PragmaticTokenizer::Tokenizer.new("making🇦🇹postcards", clean: true)
  expect(tokenizer.tokenize).to eq(%w[making postcards])
end

#3b
it 'cleans letters in boxes 2' do
  # Same input as the previous spec, but built from explicit codepoints so
  # the regional-indicator pair (U+1F1E6 U+1F1F9) is unambiguous regardless
  # of how an editor renders it.
  codepoints = [109, 97, 107, 105, 110, 103, 127462, 127481, 112, 111, 115, 116, 99, 97, 114, 100, 115]
  tokenizer = PragmaticTokenizer::Tokenizer.new(codepoints.pack("U*"), clean: true)
  expect(tokenizer.tokenize).to eq(%w[making postcards])
end

#4
it 'preserves emoticons' do
  # An ASCII emoticon must come through as one token, with its case intact
  # when downcase: false.
  tokenizer = PragmaticTokenizer::Tokenizer.new("lol :-D", downcase: false)
  expect(tokenizer.tokenize).to eq(%w[lol :-D])
end

#5
it 'removes colons' do
  # Trailing colons are stripped, but a colon inside a time expression
  # ("19:30") must be kept.
  tokenizer = PragmaticTokenizer::Tokenizer.new("At 19:30 o'clock: Mad Max: Fury Road", clean: true)
  expect(tokenizer.tokenize).to eq(%w[at 19:30 o'clock mad max fury road])
end

#6
it 'removes double single quotes' do
  # Doubled single quotes (often used as pseudo double quotes) should be
  # dropped entirely with punctuation: :none and clean: true.
  tokenizer = PragmaticTokenizer::Tokenizer.new(
    "Strong statement in ''The Day The Earth Caught Fire'' (1961)",
    punctuation: :none,
    clean:       true
  )
  expect(tokenizer.tokenize).to eq(%w[strong statement in the day the earth caught fire 1961])
end

#7
it 'removes a hyphen prefix 1' do
  text = "Geopol.-Strategy"
  # FIX: the original snippet ended this line with a stray "), " — a comma
  # after the closing paren — which is a Ruby syntax error. Removed it.
  pt = PragmaticTokenizer::Tokenizer.new(text, punctuation: :none, clean: true, long_word_split: 50)
  expect(pt.tokenize).to eq(
    ["geopol", "strategy"]
  )
end

#8
it 'removes a hyphen prefix 2' do
  text = "The language we use creates the reality we experience.-Michael Hyatt #quote"
  # FIX: removed the stray trailing "), " after the constructor call
  # (syntax error in the original snippet).
  pt = PragmaticTokenizer::Tokenizer.new(text, punctuation: :none, clean: true, long_word_split: 50)
  expect(pt.tokenize).to eq(
    ["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"]
  )
end

#9
it 'removes a hyphen prefix 3' do
  text = "women's clothes and –shoes needed"
  # FIX: removed the stray trailing "), " after the constructor call
  # (syntax error in the original snippet).
  pt = PragmaticTokenizer::Tokenizer.new(text, clean: true)
  expect(pt.tokenize).to eq(
    ["women's", "clothes", "and", "shoes", "needed"]
  )
end

#10
it 'treats abbreviations always the same' do
  text = "U.S.A. U.S.A. U.S.A."
  # FIX: removed the stray trailing "), " after the constructor call
  # (syntax error in the original snippet).
  pt = PragmaticTokenizer::Tokenizer.new(text)
  expect(pt.tokenize).to eq(
    ["u.s.a.", "u.s.a.", "u.s.a."]
  )
end
diasks2 commented 8 years ago

I'm going to move 4 to a separate issue as I currently do not have any good way to detect emoticons.

diasks2 commented 8 years ago

OK, these should be passing except for 4 which I have moved to a separate issue: https://github.com/diasks2/pragmatic_tokenizer/commit/8282cab7b8b7360e70ff568316d65547ddf7ecec