// tag.go, part of the prose NLP library for Go (upstream: https://github.com/jdkato/prose).

// Copyright 2013 Matthew Honnibal
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
package prose
import (
"math"
"regexp"
"strconv"
"strings"
)
// TupleSlice is a slice of tuples in the form (words, tags).
type TupleSlice [][][]string
// Len returns the number of tuples in the TupleSlice.
func (t TupleSlice) Len() int { return len(t) }
// Swap switches the ith and jth elements in a TupleSlice.
func (t TupleSlice) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
// ReadTagged converts pre-tagged input into a TupleSlice suitable for training.
func ReadTagged(text, sep string) TupleSlice {
lines := strings.Split(text, "\n")
length := len(lines)
t := make(TupleSlice, length)
for i, sent := range lines {
set := strings.Split(sent, " ")
length = len(set)
tokens := make([]string, length)
tags := make([]string, length)
for j, token := range set {
parts := strings.Split(token, sep)
tokens[j] = parts[0]
tags[j] = parts[1]
}
t[i] = [][]string{tokens, tags}
}
return t
}
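
// exampleReadTagged is an illustration-only sketch (not part of the upstream
// prose source). It shows the input format ReadTagged expects: one sentence
// per line, with every token joined to its tag by sep. Note that each token
// must contain sep, or the parts[1] access above will panic.
func exampleReadTagged() TupleSlice {
	// With sep = "|", each token looks like "word|TAG".
	input := "The|DT dog|NN barked|VBD .|.\n" +
		"It|PRP ran|VBD .|."
	tagged := ReadTagged(input, "|")
	// tagged[0][0] == []string{"The", "dog", "barked", "."}
	// tagged[0][1] == []string{"DT", "NN", "VBD", "."}
	return tagged
}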
var none = regexp.MustCompile(`^(?:0|\*[\w?]\*|\*\-\d{1,3}|\*[A-Z]+\*\-\d{1,3}|\*)$`)
var keep = regexp.MustCompile(`^\-[A-Z]{3}\-$`)
// averagedPerceptron is an Averaged Perceptron classifier.
type averagedPerceptron struct {
classes []string
stamps map[string]float64
totals map[string]float64
tagMap map[string]string
weights map[string]map[string]float64
// TODO: Training
//
// instances float64
}
// newAveragedPerceptron creates a new AveragedPerceptron model.
func newAveragedPerceptron(weights map[string]map[string]float64,
tags map[string]string, classes []string) *averagedPerceptron {
return &averagedPerceptron{
totals: make(map[string]float64), stamps: make(map[string]float64),
classes: classes, tagMap: tags, weights: weights}
}
/* TODO: Training API
"github.com/shogo82148/go-shuffle"
// marshal saves the model to disk.
func (m *averagedPerceptron) marshal(path string) error {
folder := filepath.Join(path, "AveragedPerceptron")
err := os.Mkdir(folder, os.ModePerm)
for i, entry := range []string{"weights", "tags", "classes"} {
component, _ := os.Create(filepath.Join(folder, entry+".gob"))
encoder := gob.NewEncoder(component)
if i == 0 {
checkError(encoder.Encode(m.weights))
} else if i == 1 {
checkError(encoder.Encode(m.tagMap))
} else {
checkError(encoder.Encode(m.classes))
}
}
return err
}
// train an Averaged Perceptron model based on sentences.
func (pt *perceptronTagger) train(sentences TupleSlice, iterations int) {
var guess string
var found bool
pt.makeTagMap(sentences)
for i := 0; i < iterations; i++ {
for _, tuple := range sentences {
words, tags := tuple[0], tuple[1]
p1, p2 := "-START-", "-START2-"
context := []string{p1, p2}
for _, w := range words {
if w == "" {
continue
}
context = append(context, normalize(w))
}
context = append(context, []string{"-END-", "-END2-"}...)
for i, word := range words {
if guess, found = pt.tagMap[word]; !found {
feats := featurize(i, context, word, p1, p2)
guess = pt.model.predict(feats)
pt.model.update(tags[i], guess, feats)
}
p2 = p1
p1 = guess
}
}
shuffle.Shuffle(sentences)
}
pt.model.averageWeights()
}
func (m *averagedPerceptron) averageWeights() {
for feat, weights := range m.weights {
newWeights := make(map[string]float64)
for class, weight := range weights {
key := feat + "-" + class
total := m.totals[key]
total += (m.instances - m.stamps[key]) * weight
averaged, _ := stats.Round(total/m.instances, 3)
if averaged != 0.0 {
newWeights[class] = averaged
}
}
m.weights[feat] = newWeights
}
}
// newTrainedPerceptronTagger creates a new PerceptronTagger using the given
// model.
func newTrainedPerceptronTagger(model *averagedPerceptron) *perceptronTagger {
return &perceptronTagger{model: model}
}
func (pt *perceptronTagger) makeTagMap(sentences TupleSlice) {
counts := make(map[string]map[string]int)
for _, tuple := range sentences {
words, tags := tuple[0], tuple[1]
for i, word := range words {
tag := tags[i]
if counts[word] == nil {
counts[word] = make(map[string]int)
}
counts[word][tag]++
pt.model.addClass(tag)
}
}
for word, tagFreqs := range counts {
tag, mode := maxValue(tagFreqs)
n := float64(sumValues(tagFreqs))
if n >= 20 && (float64(mode)/n) >= 0.97 {
pt.tagMap[word] = tag
}
}
}
func sumValues(m map[string]int) int {
sum := 0
for _, v := range m {
sum += v
}
return sum
}
func maxValue(m map[string]int) (string, int) {
maxValue := 0
key := ""
for k, v := range m {
if v >= maxValue {
maxValue = v
key = k
}
}
return key, maxValue
}
func get(k string, m map[string]float64) float64 {
if v, ok := m[k]; ok {
return v
}
return 0.0
}
func (m *averagedPerceptron) update(truth, guess string, feats map[string]float64) {
m.instances++
if truth == guess {
return
}
for f := range feats {
weights := make(map[string]float64)
if val, ok := m.weights[f]; ok {
weights = val
} else {
m.weights[f] = weights
}
m.updateFeat(truth, f, get(truth, weights), 1.0)
m.updateFeat(guess, f, get(guess, weights), -1.0)
}
}
func (m *averagedPerceptron) updateFeat(c, f string, v, w float64) {
key := f + "-" + c
m.totals[key] = (m.instances - m.stamps[key]) * w
m.stamps[key] = m.instances
m.weights[f][c] = w + v
}
func (m *averagedPerceptron) addClass(class string) {
if !stringInSlice(class, m.classes) {
m.classes = append(m.classes, class)
}
}*/
// perceptronTagger is a port of TextBlob's "fast and accurate" POS tagger.
// See https://github.com/sloria/textblob-aptagger for details.
type perceptronTagger struct {
model *averagedPerceptron
}
// newPerceptronTagger creates a new PerceptronTagger and loads the built-in
// AveragedPerceptron model.
func newPerceptronTagger() *perceptronTagger {
var wts map[string]map[string]float64
var tags map[string]string
var classes []string
dec := getAsset("AveragedPerceptron", "classes.gob")
checkError(dec.Decode(&classes))
dec = getAsset("AveragedPerceptron", "tags.gob")
checkError(dec.Decode(&tags))
dec = getAsset("AveragedPerceptron", "weights.gob")
checkError(dec.Decode(&wts))
return &perceptronTagger{model: newAveragedPerceptron(wts, tags, classes)}
}
// tag takes a slice of tokens and returns a slice of tagged tokens.
func (pt *perceptronTagger) tag(tokens []*Token) []*Token {
var tag string
var found bool
p1, p2 := "-START-", "-START2-"
length := len(tokens) + 4
context := make([]string, length)
context[0] = p1
context[1] = p2
for i, t := range tokens {
context[i+2] = normalize(t.Text)
}
context[length-2] = "-END-"
context[length-1] = "-END2-"
for i := 0; i < len(tokens); i++ {
word := tokens[i].Text
if word == "-" {
tag = "-"
} else if _, ok := emoticons[word]; ok {
tag = "SYM"
} else if strings.HasPrefix(word, "@") {
// TODO: URLs and emails?
tag = "NN"
} else if none.MatchString(word) {
tag = "-NONE-"
} else if keep.MatchString(word) {
tag = word
} else if tag, found = pt.model.tagMap[word]; !found {
tag = pt.model.predict(featurize(i, context, word, p1, p2))
}
tokens[i].Tag = tag
p2 = p1
p1 = tag
}
return tokens
}
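
// exampleTagTokens is an illustration-only sketch (not part of the upstream
// prose source). It shows how the tagger is driven internally: the default
// averaged-perceptron model is decoded from the embedded gob assets and each
// Token has its Tag field filled in. In typical use of the package this step
// is performed for you when a Document is created.
func exampleTagTokens() []*Token {
	tagger := newPerceptronTagger()
	tokens := []*Token{
		{Text: "Go"},
		{Text: "is"},
		{Text: "fun"},
		{Text: "."},
	}
	// tag mutates the Tag field of each token and returns the same slice.
	return tagger.tag(tokens)
}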
func (m *averagedPerceptron) predict(features map[string]float64) string {
var weights map[string]float64
var found bool
scores := make(map[string]float64)
for feat, value := range features {
if weights, found = m.weights[feat]; !found || value == 0 {
continue
}
for label, weight := range weights {
scores[label] += value * weight
}
}
return max(scores)
}
func max(scores map[string]float64) string {
var class string
max := math.Inf(-1)
for label, value := range scores {
if value > max {
max = value
class = label
}
}
return class
}
func featurize(i int, ctx []string, w, p1, p2 string) map[string]float64 {
feats := make(map[string]float64)
suf := min(len(w), 3)
i = min(len(ctx)-2, i+2)
iminus := min(len(ctx[i-1]), 3)
iplus := min(len(ctx[i+1]), 3)
feats = add([]string{"bias"}, feats)
feats = add([]string{"i suffix", w[len(w)-suf:]}, feats)
feats = add([]string{"i pref1", string(w[0])}, feats)
feats = add([]string{"i-1 tag", p1}, feats)
feats = add([]string{"i-2 tag", p2}, feats)
feats = add([]string{"i tag+i-2 tag", p1, p2}, feats)
feats = add([]string{"i word", ctx[i]}, feats)
feats = add([]string{"i-1 tag+i word", p1, ctx[i]}, feats)
feats = add([]string{"i-1 word", ctx[i-1]}, feats)
feats = add([]string{"i-1 suffix", ctx[i-1][len(ctx[i-1])-iminus:]}, feats)
feats = add([]string{"i-2 word", ctx[i-2]}, feats)
feats = add([]string{"i+1 word", ctx[i+1]}, feats)
feats = add([]string{"i+1 suffix", ctx[i+1][len(ctx[i+1])-iplus:]}, feats)
feats = add([]string{"i+2 word", ctx[i+2]}, feats)
return feats
}
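
// exampleFeaturize is an illustration-only sketch (not part of the upstream
// prose source). For the token "fox" at position 2 of the sentence
// "the quick fox ran fast", with the previously predicted tags p1 = "JJ"
// (for "quick") and p2 = "DT" (for "the"), featurize yields unit counts for
// keys such as "bias", "i word fox", "i suffix fox", "i-1 word quick",
// "i-1 tag JJ", "i-1 tag+i word JJ fox", and "i+1 word ran".
func exampleFeaturize() map[string]float64 {
	// The context is padded with two start and two end markers, matching how
	// tag and the training loop build it; featurize shifts i by two to index
	// into the padded slice.
	ctx := []string{"-START-", "-START2-", "the", "quick", "fox", "ran", "fast", "-END-", "-END2-"}
	return featurize(2, ctx, "fox", "JJ", "DT")
}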
func add(args []string, features map[string]float64) map[string]float64 {
key := strings.Join(args, " ")
features[key]++
return features
}
func normalize(word string) string {
if word == "" {
return word
}
first := string(word[0])
if strings.Contains(word, "-") && first != "-" {
return "!HYPHEN"
} else if _, err := strconv.Atoi(word); err == nil && len(word) == 4 {
return "!YEAR"
} else if _, err := strconv.Atoi(first); err == nil {
return "!DIGITS"
}
return strings.ToLower(word)
}
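
// exampleNormalize is an illustration-only sketch (not part of the upstream
// prose source) of how normalize collapses raw tokens into the coarse classes
// used as features: hyphenated words, four-digit numbers, other numbers, and
// a lower-cased default.
func exampleNormalize() []string {
	return []string{
		normalize("co-operate"), // "!HYPHEN"
		normalize("1984"),       // "!YEAR"
		normalize("42"),         // "!DIGITS"
		normalize("Apple"),      // "apple"
	}
}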