// Copyright (c) 2015 Eric Bower
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package prose

import (
	"regexp"
	"strings"

	"gopkg.in/neurosnap/sentences.v1"
	"gopkg.in/neurosnap/sentences.v1/data"
)

// punktSentenceTokenizer is an extension of the Go implementation of the Punkt
// sentence tokenizer (https://github.com/neurosnap/sentences), with a few
// minor improvements (see https://github.com/neurosnap/sentences/pull/18).
type punktSentenceTokenizer struct {
	tokenizer *sentences.DefaultSentenceTokenizer
}

// newPunktSentenceTokenizer creates a new punktSentenceTokenizer and loads
// its English model.
func newPunktSentenceTokenizer() *punktSentenceTokenizer {
	var pt punktSentenceTokenizer
	var err error

	pt.tokenizer, err = newSentenceTokenizer(nil)
	checkError(err)

	return &pt
}

// segment splits text into sentences.
func (p punktSentenceTokenizer) segment(text string) []Sentence {
	tokens := p.tokenizer.Tokenize(text)
	sents := make([]Sentence, len(tokens))
	for i := range tokens {
		sents[i] = Sentence{Text: strings.TrimSpace(tokens[i].Text)}
	}
	return sents
}
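
// A minimal usage sketch (added for illustration; not part of the original
// file). It assumes the surrounding prose package, its Sentence type, and an
// fmt import in the caller:
//
//	pt := newPunktSentenceTokenizer()
//	for _, sent := range pt.segment("Mr. Smith went to Washington. He arrived at 6 p.m.") {
//		fmt.Println(sent.Text) // one trimmed sentence per line
//	}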

// wordTokenizer embeds the default word tokenizer from the sentences package
// so that its sentence-ending heuristics can be overridden below.
type wordTokenizer struct {
	sentences.DefaultWordTokenizer
}

// reAbbr matches period-separated abbreviations such as "F.B.I." or "U.S.".
var reAbbr = regexp.MustCompile(`((?:[\w]\.)+[\w]*\.)`)

// reLooksLikeEllipsis matches ellipses such as "..." or ". . .".
var reLooksLikeEllipsis = regexp.MustCompile(`(?:\.\s?){2,}\.`)

// reEntities matches known entities whose punctuation should not be treated
// as a sentence break.
var reEntities = regexp.MustCompile(`Yahoo!`)

// newSentenceTokenizer creates a sentence tokenizer customized for English.
// If s is nil, the pre-trained English model bundled with the sentences
// package is loaded.
func newSentenceTokenizer(s *sentences.Storage) (*sentences.DefaultSentenceTokenizer, error) {
	training := s
	if training == nil {
		b, err := data.Asset("data/english.json")
		if err != nil {
			return nil, err
		}
		training, err = sentences.LoadTraining(b)
		if err != nil {
			return nil, err
		}
	}

	// Add abbreviations missing from the training data: "sgt." (sergeant),
	// "gov." (governor), "no." (number), and "mt." (mount).
	abbrevs := []string{"sgt", "gov", "no", "mt"}
	for _, abbr := range abbrevs {
		training.AbbrevTypes.Add(abbr)
	}

	lang := sentences.NewPunctStrings()
	word := newWordTokenizer(lang)
	annotations := sentences.NewAnnotations(training, lang, word)

	ortho := &sentences.OrthoContext{
		Storage:      training,
		PunctStrings: lang,
		TokenType:    word,
		TokenFirst:   word,
	}

	multiPunct := &multiPunctWordAnnotation{
		Storage:      training,
		TokenParser:  word,
		TokenGrouper: &sentences.DefaultTokenGrouper{},
		Ortho:        ortho,
	}
	annotations = append(annotations, multiPunct)

	tokenizer := &sentences.DefaultSentenceTokenizer{
		Storage:       training,
		PunctStrings:  lang,
		WordTokenizer: word,
		Annotations:   annotations,
	}
	return tokenizer, nil
}
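
// A hedged sketch of further customization (the extra abbreviation "col" is
// an illustrative assumption, not something the original code registers):
//
//	st, err := newSentenceTokenizer(nil)
//	if err != nil {
//		// handle the error
//	}
//	st.Storage.AbbrevTypes.Add("col") // treat "col." like "sgt." above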

// newWordTokenizer creates a word tokenizer that uses p for its punctuation
// definitions.
func newWordTokenizer(p sentences.PunctStrings) *wordTokenizer {
	word := &wordTokenizer{}
	word.PunctStrings = p
	return word
}

// HasSentEndChars reports whether t ends a sentence: either its suffix is
// sentence-ending punctuation (optionally followed by a closing quote or
// parenthesis), or it contains such punctuation immediately before an
// opening quote, bracket, or parenthesis.
func (e *wordTokenizer) HasSentEndChars(t *sentences.Token) bool {
	enders := []string{
		`."`, `.)`, `.’`, `.”`,
		`?`, `?"`, `?'`, `?)`, `?’`, `?”`,
		`!`, `!"`, `!'`, `!)`, `!’`, `!”`,
	}
	for _, ender := range enders {
		if strings.HasSuffix(t.Tok, ender) && !reEntities.MatchString(t.Tok) {
			return true
		}
	}

	parens := []string{
		`.[`, `.(`, `."`,
		`?[`, `?(`,
		`![`, `!(`,
	}
	for _, paren := range parens {
		if strings.Contains(t.Tok, paren) {
			return true
		}
	}

	return false
}
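
// Illustrative cases (not from the original file): a token ending in `."`
// matches the enders list, while "Yahoo!" is excluded by reEntities even
// though it ends in "!":
//
//	w := newWordTokenizer(sentences.NewPunctStrings())
//	w.HasSentEndChars(&sentences.Token{Tok: `done."`}) // true
//	w.HasSentEndChars(&sentences.Token{Tok: "Yahoo!"}) // false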

// multiPunctWordAnnotation attempts to tease out custom abbreviations such as
// "F.B.I."
type multiPunctWordAnnotation struct {
	*sentences.Storage
	sentences.TokenParser
	sentences.TokenGrouper
	sentences.Ortho
}

// Annotate iterates over token pairs, undoing sentence breaks mislabeled
// because of errant newlines and delegating paired tokens to tokenAnnotation.
func (a *multiPunctWordAnnotation) Annotate(tokens []*sentences.Token) []*sentences.Token {
	for _, tokPair := range a.TokenGrouper.Group(tokens) {
		if len(tokPair) < 2 || tokPair[1] == nil {
			tok := tokPair[0].Tok
			if strings.Contains(tok, "\n") && strings.Contains(tok, " ") {
				// We've mislabeled due to an errant newline.
				tokPair[0].SentBreak = false
			}
			continue
		}
		a.tokenAnnotation(tokPair[0], tokPair[1])
	}
	return tokens
}

// looksInternal determines if tok's punctuation could appear
// sentence-internally (i.e., parentheses or quotations).
func looksInternal(tok string) bool {
	internal := []string{")", `’`, `”`, `"`, `'`}
	for _, punc := range internal {
		if strings.HasSuffix(tok, punc) {
			return true
		}
	}
	return false
}
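
// For instance (illustrative values, not from the original file):
//
//	looksInternal(`said.)`) // true: ends with a closing parenthesis
//	looksInternal("left.")  // false: a bare period is not internal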

// tokenAnnotation applies the ellipsis, sentence-internal punctuation, and
// abbreviation heuristics to the adjacent tokens tokOne and tokTwo.
func (a *multiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Token) {
	// Computing the next token's type is an expensive calculation, so we
	// cache it in nextTyp and only do it once.
	var nextTyp string

	// If both tokOne and tokTwo are periods, we're probably in an ellipsis
	// that wasn't properly tokenized by `WordTokenizer`.
	if strings.HasSuffix(tokOne.Tok, ".") && tokTwo.Tok == "." {
		tokOne.SentBreak = false
		tokTwo.SentBreak = false
		return
	}

	isNonBreak := strings.HasSuffix(tokOne.Tok, ".") && !tokOne.SentBreak
	isEllipsis := reLooksLikeEllipsis.MatchString(tokOne.Tok)
	isInternal := tokOne.SentBreak && looksInternal(tokOne.Tok)

	if isNonBreak || isEllipsis || isInternal {
		nextTyp = a.TokenParser.TypeNoSentPeriod(tokTwo)
		isStarter := a.SentStarters[nextTyp]

		// If tokOne looks like an ellipsis and tokTwo is either
		// capitalized or a frequent sentence starter, break the sentence.
		if isEllipsis {
			if a.TokenParser.FirstUpper(tokTwo) || isStarter != 0 {
				tokOne.SentBreak = true
				return
			}
		}

		// If tokOne's sentence-breaking punctuation looks like it could
		// occur sentence-internally, ensure that the following word is
		// either capitalized or a frequent sentence starter.
		if isInternal {
			if a.TokenParser.FirstLower(tokTwo) && isStarter == 0 {
				tokOne.SentBreak = false
				return
			}
		}

		// If tokOne ends with a period but isn't marked as a sentence
		// break, mark it if tokTwo is capitalized and its type can occur
		// in _ORTHO_LC (112 is Punkt's lowercase orthographic mask).
		if isNonBreak && a.TokenParser.FirstUpper(tokTwo) {
			if a.Storage.OrthoContext[nextTyp]&112 != 0 {
				tokOne.SentBreak = true
			}
		}
	}

	if len(reAbbr.FindAllString(tokOne.Tok, 1)) == 0 {
		return
	}

	if a.IsInitial(tokOne) {
		return
	}

	tokOne.Abbr = true
	tokOne.SentBreak = false

	// [4.1.1. Orthographic Heuristic] Check if there's orthographic
	// evidence about whether the next word starts a sentence or not.
	isSentStarter := a.Ortho.Heuristic(tokTwo)
	if isSentStarter == 1 {
		tokOne.SentBreak = true
		return
	}

	if nextTyp == "" {
		nextTyp = a.TokenParser.TypeNoSentPeriod(tokTwo)
	}

	// [4.1.3. Frequent Sentence Starter Heuristic] If the next word is
	// capitalized and is a member of the frequent-sentence-starters list,
	// then label tokOne as a sentence break.
	if a.TokenParser.FirstUpper(tokTwo) && a.SentStarters[nextTyp] != 0 {
		tokOne.SentBreak = true
		return
	}
}
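
// An illustrative sketch of the behavior these annotations aim for (the
// sample text is an assumption; actual output depends on the bundled
// English training data):
//
//	pt := newPunktSentenceTokenizer()
//	sents := pt.segment("The F.B.I. is investigating. Nobody was hurt.")
//	// Expected: two sentences, with "F.B.I." kept as an abbreviation
//	// rather than treated as three sentence breaks.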