6 Star 5 Fork 3

Gitee 极速下载 / Prose-Go

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
此仓库是为了提升国内下载速度的镜像仓库,每日同步一次。 原始仓库: https://github.com/jdkato/prose
extract.go 12.85 KB
一键复制 编辑 原始数据 按行查看 历史
Eric Gaudet 提交于 2020-08-19 16:49 . Introduce Tokenizer interface (#61)
package prose
import (
	"encoding/gob"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"unicode"

	"gonum.org/v1/gonum/mat"
)
// maxLogDiff is log2(1e-30). addLogs uses it as a cutoff: when one operand is
// smaller than the other operand plus maxLogDiff, the larger operand is
// returned unchanged instead of computing the full log-sum.
var maxLogDiff = math.Log2(1e-30)
// mappedProbDist is a probability distribution keyed by label. The values in
// dict are base-2 logarithms of the probabilities; prob converts them back.
type mappedProbDist struct {
	dict map[string]float64 // label -> log2(probability)
	log  bool               // set to true by newMappedProbDist; prob does not consult it
}

// prob returns the probability of label, i.e. 2 raised to the stored
// log-probability. A label that is not in the distribution has probability 0.
func (m *mappedProbDist) prob(label string) float64 {
	p, found := m.dict[label]
	if !found {
		return 0.0
	}
	return math.Pow(2, p)
}
func newMappedProbDist(dict map[string]float64, normalize bool) *mappedProbDist {
if normalize {
values := make([]float64, len(dict))
i := 0
for _, v := range dict {
values[i] = v
sum := sumLogs(values)
if sum <= math.Inf(-1) {
p := math.Log2(1.0 / float64(len(dict)))
for k := range dict {
dict[k] = p
} else {
for k := range dict {
dict[k] -= sum
return &mappedProbDist{dict: dict, log: true}
// encodedValue is one entry of a sparse feature encoding: key is a feature id
// (an index into the classifier's weights) and value is that feature's value.
type encodedValue struct {
	key   int
	value int
}
// feature pairs the feature map extracted for one token with that token's
// label.
type feature struct {
	label    string
	features map[string]string // feature name -> feature value
}

// featureSet is a corpus of labeled feature maps.
type featureSet []feature
// featureOrder lists the feature names in the fixed order in which the
// encoders walk a feature map, so encodings are built deterministically
// regardless of Go's map iteration order.
var featureOrder = []string{
	"bias",
	"en-wordlist",
	"nextpos",
	"nextword",
	"pos",
	"pos+prevtag",
	"prefix3",
	"prevpos",
	"prevtag",
	"prevword",
	"shape",
	"shape+prevtag",
	"suffix3",
	"word",
	"word+nextpos",
	"word.lower",
	"wordlen",
}
// binaryMaxentClassifier is a feature encoding that generates vectors
// containing binary joint-features of the form:
//
//	joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
//	                    {
//	                    { 0 otherwise
//
// where `fname` is the name of an input-feature, `fval` is a value for that
// input-feature, and `label` is a label.
//
// See https://www.nltk.org/_modules/nltk/classify/maxent.html for more
// information.
type binaryMaxentClassifier struct {
	// cardinality is computed by newMaxentClassifier as the number of
	// distinct leading key segments in mapping, plus one. encodeGIS uses it
	// as the constant every encoding's values must sum to.
	cardinality int
	// labels are the labels the classifier scores.
	labels []string
	// mapping maps a joint-feature key "fname-fval-label" to its feature id.
	mapping map[string]int
	// weights holds one weight per feature id.
	weights []float64
}
// newMaxentClassifier creates a new binaryMaxentClassifier from the provided
// input values.
func newMaxentClassifier(
weights []float64,
mapping map[string]int,
labels []string) *binaryMaxentClassifier {
set := make(map[string]struct{})
for label := range mapping {
k := strings.Split(label, "-")[0]
set[k] = struct{}{}
return &binaryMaxentClassifier{
len(set) + 1,
// marshal saves the model to disk.
func (m *binaryMaxentClassifier) marshal(path string) error {
folder := filepath.Join(path, "Maxent")
err := os.Mkdir(folder, os.ModePerm)
for i, entry := range []string{"labels", "mapping", "weights"} {
component, _ := os.Create(filepath.Join(folder, entry+".gob"))
encoder := gob.NewEncoder(component)
if i == 0 {
} else if i == 1 {
} else {
return err
// entityExtracter is a maximum entropy classifier.
// See https://www.nltk.org/_modules/nltk/classify/maxent.html for more
// information.
type entityExtracter struct {
model *binaryMaxentClassifier
// newEntityExtracter creates a new entityExtracter using the default model.
func newEntityExtracter() *entityExtracter {
var mapping map[string]int
var weights []float64
var labels []string
dec := getAsset("Maxent", "mapping.gob")
dec = getAsset("Maxent", "weights.gob")
dec = getAsset("Maxent", "labels.gob")
return &entityExtracter{model: newMaxentClassifier(weights, mapping, labels)}
// newTrainedEntityExtracter creates a new EntityExtracter using the given
// model.
func newTrainedEntityExtracter(model *binaryMaxentClassifier) *entityExtracter {
return &entityExtracter{model: model}
// chunk finds named-entity "chunks" from the given, pre-labeled tokens.
func (e *entityExtracter) chunk(tokens []*Token) []Entity {
entities := []Entity{}
end := ""
parts := []*Token{}
idx := 0
for _, tok := range tokens {
label := tok.Label
if (label != "O" && label != end) ||
(idx > 0 && tok.Tag == parts[idx-1].Tag) ||
(idx > 0 && tok.Tag == "CD" && parts[idx-1].Label != "O") {
end = strings.Replace(label, "B", "I", 1)
parts = append(parts, tok)
} else if (label == "O" && end != "") || label == end {
// We've found the end of an entity.
if label != "O" {
parts = append(parts, tok)
entities = append(entities, coalesce(parts))
end = ""
parts = []*Token{}
idx = 0
return entities
func (m *binaryMaxentClassifier) encode(features map[string]string, label string) []encodedValue {
encoding := []encodedValue{}
for _, key := range featureOrder {
val := features[key]
entry := strings.Join([]string{key, val, label}, "-")
if ret, found := m.mapping[entry]; found {
encoding = append(encoding, encodedValue{
key: ret,
value: 1})
return encoding
func (m *binaryMaxentClassifier) encodeGIS(features map[string]string, label string) []encodedValue {
encoding := m.encode(features, label)
length := len(m.mapping)
total := 0
for _, v := range encoding {
total += v.value
encoding = append(encoding, encodedValue{
key: length,
value: m.cardinality - total})
return encoding
// adjustPos translates the span [start, end) of text into the coordinates of
// text with all whitespace removed.
//
// It counts the whitespace runes located before start and before end and
// subtracts those counts from start and end respectively. Positions are rune
// ordinals within text.
//
// NOTE(review): callers pass span offsets and later compare the result with
// byte lengths (len(tok.Text)); confirm the offsets are meant to be rune
// based for non-ASCII text.
func adjustPos(text string, start, end int) (int, int) {
	left, right := 0, 0 // whitespace runes seen before start / before end
	pos := 0            // ordinal of the current rune in text
	for _, r := range text {
		if unicode.IsSpace(r) {
			if pos < start {
				left++
			}
			if pos < end {
				right++
			}
		}
		pos++
	}
	return start - left, end - right
}
func extractFeatures(tokens []*Token, history []string) []feature {
features := make([]feature, len(tokens))
for i := range tokens {
features[i] = feature{
label: history[i],
features: extract(i, tokens, history)}
return features
func assignLabels(tokens []*Token, entity *EntityContext) []string {
history := make([]string, len(tokens))
for i := range tokens {
history[i] = "O"
if entity.Accept {
for _, span := range entity.Spans {
start, end := adjustPos(entity.Text, span.Start, span.End)
index := 0
for i, tok := range tokens {
if index == start {
history[i] = "B-" + span.Label
} else if index > start && index < end {
history[i] = "I-" + span.Label
index += len(tok.Text)
return history
func makeCorpus(data []EntityContext, tagger *perceptronTagger, tokenizer Tokenizer) featureSet {
corpus := featureSet{}
for i := range data {
entry := &data[i]
tokens := tagger.tag(tokenizer.Tokenize(entry.Text))
history := assignLabels(tokens, entry)
for _, element := range extractFeatures(tokens, history) {
corpus = append(corpus, element)
return corpus
func extracterFromData(corpus featureSet) *entityExtracter {
encoding := encode(corpus)
cInv := 1.0 / float64(encoding.cardinality)
empfreq := empiricalCount(corpus, encoding)
rows, _ := empfreq.Dims()
unattested := []int{}
for index := 0; index < rows; index++ {
if empfreq.At(index, 0) == 0.0 {
unattested = append(unattested, index)
empfreq.SetVec(index, math.Log2(empfreq.At(index, 0)))
weights := make([]float64, rows)
for _, idx := range unattested {
weights[idx] = math.Inf(-1)
encoding.weights = weights
classifier := newTrainedEntityExtracter(encoding)
for index := 0; index < 100; index++ {
est := estCount(classifier, corpus, encoding)
for _, idx := range unattested {
est.SetVec(idx, est.AtVec(idx)+1)
rows, _ := est.Dims()
for index := 0; index < rows; index++ {
est.SetVec(index, math.Log2(est.At(index, 0)))
weights = classifier.model.weights
est.SubVec(empfreq, est)
est.ScaleVec(cInv, est)
for index := 0; index < len(weights); index++ {
weights[index] += est.AtVec(index)
classifier.model.weights = weights
return classifier
func estCount(
classifier *entityExtracter,
corpus featureSet,
encoder *binaryMaxentClassifier,
) *mat.VecDense {
count := mat.NewVecDense(len(encoder.mapping)+1, nil)
for _, entry := range corpus {
pdist := classifier.probClassify(entry.features)
for label := range pdist.dict {
prob := pdist.prob(label)
for _, enc := range encoder.encodeGIS(entry.features, label) {
out := count.AtVec(enc.key) + (prob * float64(enc.value))
count.SetVec(enc.key, out)
return count
func (e *entityExtracter) classify(tokens []*Token) []*Token {
length := len(tokens)
history := make([]string, 0, length)
for i := 0; i < length; i++ {
scores := make(map[string]float64)
features := extract(i, tokens, history)
for _, label := range e.model.labels {
total := 0.0
for _, encoded := range e.model.encode(features, label) {
total += e.model.weights[encoded.key] * float64(encoded.value)
scores[label] = total
label := max(scores)
tokens[i].Label = label
history = append(history, simplePOS(label))
return tokens
func (e *entityExtracter) probClassify(features map[string]string) *mappedProbDist {
scores := make(map[string]float64)
for _, label := range e.model.labels {
vec := e.model.encodeGIS(features, label)
total := 0.0
for _, entry := range vec {
total += e.model.weights[entry.key] * float64(entry.value)
scores[label] = total
//&mappedProbDist{dict: scores, log: true}
return newMappedProbDist(scores, true)
func parseEntities(ents []string) string {
if stringInSlice("B-PERSON", ents) && len(ents) == 2 {
// PERSON takes precedence because it's hard to identify.
return "PERSON"
return strings.Split(ents[0], "-")[1]
func coalesce(parts []*Token) Entity {
length := len(parts)
labels := make([]string, length)
tokens := make([]string, length)
for i, tok := range parts {
tokens[i] = tok.Text
labels[i] = tok.Label
return Entity{
Label: parseEntities(labels),
Text: strings.Join(tokens, " "),
func extract(i int, ctx []*Token, history []string) map[string]string {
feats := make(map[string]string)
word := ctx[i].Text
prevShape := "None"
feats["bias"] = "True"
feats["word"] = word
feats["pos"] = ctx[i].Tag
feats["en-wordlist"] = isBasic(word)
feats["word.lower"] = strings.ToLower(word)
feats["suffix3"] = nSuffix(word, 3)
feats["prefix3"] = nPrefix(word, 3)
feats["shape"] = shape(word)
feats["wordlen"] = strconv.Itoa(len(word))
if i == 0 {
feats["prevtag"] = "None"
feats["prevword"], feats["prevpos"] = "None", "None"
} else if i == 1 {
feats["prevword"] = strings.ToLower(ctx[i-1].Text)
feats["prevpos"] = ctx[i-1].Tag
feats["prevtag"] = history[i-1]
} else {
feats["prevword"] = strings.ToLower(ctx[i-1].Text)
feats["prevpos"] = ctx[i-1].Tag
feats["prevtag"] = history[i-1]
prevShape = shape(ctx[i-1].Text)
if i == len(ctx)-1 {
feats["nextword"], feats["nextpos"] = "None", "None"
} else {
feats["nextword"] = strings.ToLower(ctx[i+1].Text)
feats["nextpos"] = strings.ToLower(ctx[i+1].Tag)
feats["word+nextpos"] = strings.Join(
[]string{feats["word.lower"], feats["nextpos"]}, "+")
feats["pos+prevtag"] = strings.Join(
[]string{feats["pos"], feats["prevtag"]}, "+")
feats["shape+prevtag"] = strings.Join(
[]string{prevShape, feats["prevtag"]}, "+")
return feats
func shape(word string) string {
if isNumeric(word) {
return "number"
} else if match, _ := regexp.MatchString(`\W+$`, word); match {
return "punct"
} else if match, _ := regexp.MatchString(`\w+$`, word); match {
if strings.ToLower(word) == word {
return "downcase"
} else if strings.Title(word) == word {
return "upcase"
} else {
return "mixedcase"
return "other"
// simplePOS reduces a tag to a coarse form: anything starting with "V"
// becomes "v"; otherwise the text before the first "-" is returned (the whole
// string when it contains no "-").
func simplePOS(pos string) string {
	if strings.HasPrefix(pos, "V") {
		return "v"
	}
	return strings.Split(pos, "-")[0]
}
func encode(corpus featureSet) *binaryMaxentClassifier {
mapping := make(map[string]int) // maps (fname-fval-label) -> fid
count := make(map[string]int) // maps (fname, fval) -> count
weights := []float64{}
labels := []string{}
for _, entry := range corpus {
label := entry.label
if !stringInSlice(label, labels) {
labels = append(labels, label)
for _, fname := range featureOrder {
fval := entry.features[fname]
key := strings.Join([]string{fname, fval}, "-")
entry := strings.Join([]string{fname, fval, label}, "-")
if _, found := mapping[entry]; !found {
mapping[entry] = len(mapping)
return newMaxentClassifier(weights, mapping, labels)
func empiricalCount(corpus featureSet, encoding *binaryMaxentClassifier) *mat.VecDense {
count := mat.NewVecDense(len(encoding.mapping)+1, nil)
for _, entry := range corpus {
for _, encoded := range encoding.encodeGIS(entry.features, entry.label) {
idx := encoded.key
count.SetVec(idx, count.AtVec(idx)+float64(encoded.value))
return count
func addLogs(x, y float64) float64 {
if x < y+maxLogDiff {
return y
} else if y < x+maxLogDiff {
return x
base := math.Min(x, y)
return base + math.Log2(math.Pow(2, x-base)+math.Pow(2, y-base))
func sumLogs(logs []float64) float64 {
if len(logs) == 0 {
return math.Inf(-1)
sum := logs[0]
for _, log := range logs[1:] {
sum = addLogs(sum, log)
return sum
