使用NLTK简化法语词性标注集

Question

使用NLTK简化法语词性标注集

6

如何简化斯坦福法语词性标记返回的部分？将英语句子读入到NLTK中，找到每个单词的词性是相当容易的，然后使用map_tag()来简化标记集：

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
from nltk.tag.stanford import POSTagger
from nltk.tokenize import word_tokenize
from nltk.tag import map_tag

#set java_home path from within script. Run os.getenv("JAVA_HOME") to test java_home
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"

english = u"the whole earth swarms with living beings, every plant, every grain and leaf, supports the life of thousands."

path_to_english_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger"
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"

#define english and french taggers
english_tagger = POSTagger(path_to_english_model, path_to_jar, encoding="utf-8")

#each tuple in list_of_english_pos_tuples = (word, pos)
list_of_english_pos_tuples = english_tagger.tag(word_tokenize(english))

simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in list_of_english_pos_tuples]

print simplified_pos_tags_english

#output = [(u'the', u'DET'), (u'whole', u'ADJ'), (u'earth', u'NOUN'), (u'swarms', u'NOUN'), (u'with', u'ADP'), (u'living', u'NOUN'), (u'beings', u'NOUN'), (u',', u'.'), (u'every', u'DET'), (u'plant', u'NOUN'), (u',', u'.'), (u'every', u'DET'), (u'grain', u'NOUN'), (u'and', u'CONJ'), (u'leaf', u'NOUN'), (u',', u'.'), (u'supports', u'VERB'), (u'the', u'DET'), (u'life', u'NOUN'), (u'of', u'ADP'), (u'thousands', u'NOUN'), (u'.', u'.')]

但我不确定如何将以下代码返回的法语标签映射到通用标签集：

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
from nltk.tag.stanford import POSTagger
from nltk.tokenize import word_tokenize
from nltk.tag import map_tag

#set java_home path from within script. Run os.getenv("JAVA_HOME") to test java_home
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"

french = u"Chaque plante, chaque graine, chaque particule de matière organique contient des milliers d'atomes animés."

path_to_french_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\french.tagger"
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"

french_tagger = POSTagger(path_to_french_model, path_to_jar, encoding="utf-8")

list_of_french_pos_tuples = french_tagger.tag(word_tokenize(french))

#up to this point all is well, but I'm not sure how to successfully create a simplified pos tagset with the French tuples
simplified_pos_tags_french = [(word, map_tag('SOME_ARGUMENT', 'universal', tag)) for word, tag in list_of_french_pos_tuples]
print simplified_pos_tags_french

有人知道如何简化斯坦福词性标注器中法语模型使用的默认标签集吗？非常感谢其他人可以提供的任何见解。

- duhaime

1个回答

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- duhaime · Accepted Answer

我最终只是手动将斯坦福的POS标签映射到通用标签集上。说句题外话，上面的片段是一个稍微复杂一点的工作流程的一部分，旨在衡量法语和英语句子之间的句法相似性。以下是完整代码，以防对其他人有帮助：

#!/usr/bin/python
# -*- coding: utf-8 -*-

'''NLTK 3.0 offers map_tag, which maps the Penn Treebank Tag Set to the Universal Tagset, a course tag set with the following 12 tags:

VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation

We'll map Stanford's tag set to this tag set then compare the similarity between subregions of French and English sentences.'''

from __future__ import division
import os, math
from nltk.tag.stanford import POSTagger
from nltk.tokenize import word_tokenize
from nltk.tag import map_tag
from collections import Counter

#########################
# Create Tagset Mapping #
#########################

def create_french_to_universal_dict():
    '''this function creates the dict we'll call below when we map french pos tags to the universal tag set'''
    french_to_universal = {}
    french_to_universal[u"ADJ"]    = u"ADJ"
    french_to_universal[u"ADJWH"]  = u"ADJ"
    french_to_universal[u"ADV"]    = u"ADV"
    french_to_universal[u"ADVWH"]  = u"ADV"
    french_to_universal[u"CC"]     = u"CONJ"    
    french_to_universal[u"CLO"]    = u"PRON"
    french_to_universal[u"CLR"]    = u"PRON"
    french_to_universal[u"CLS"]    = u"PRON"
    french_to_universal[u"CS"]     = u"CONJ"
    french_to_universal[u"DET"]    = u"DET"
    french_to_universal[u"DETWH"]  = u"DET"
    french_to_universal[u"ET"]     = u"X"
    french_to_universal[u"NC"]     = u"NOUN"
    french_to_universal[u"NPP"]    = u"NOUN"
    french_to_universal[u"P"]      = u"ADP"
    french_to_universal[u"PUNC"]   = u"."
    french_to_universal[u"PRO"]    = u"PRON"
    french_to_universal[u"PROREL"] = u"PRON"
    french_to_universal[u"PROWH"]  = u"PRON"
    french_to_universal[u"V"]      = u"VERB"
    french_to_universal[u"VIMP"]   = u"VERB"
    french_to_universal[u"VINF"]   = u"VERB"
    french_to_universal[u"VPP"]    = u"VERB"
    french_to_universal[u"VPR"]    = u"VERB"
    french_to_universal[u"VS"]     = u"VERB"
    #nb, I is not part of the universal tagset--interjections get mapped to X
    french_to_universal[u"I"]      = u"X"
    return french_to_universal

french_to_universal_dict = create_french_to_universal_dict()

def map_french_tag_to_universal(list_of_french_tag_tuples):
    '''this function reads in a list of tuples (word, pos) and returns the same list with pos mapped to universal tagset'''
    return [ (tup[0], french_to_universal_dict[ tup[1] ]) for tup in list_of_french_tag_tuples ]

###############################
# Define Similarity Functions #
###############################

def counter_cosine_similarity(c1, c2):
    '''this function reads in two counters and returns their cosine similarity'''
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

def longest_common_subsequence_length(a, b):
    '''this function reads in two lists and returns the length of their longest common subsequence'''
    table = [[0] * (len(b) + 1) for _ in xrange(len(a) + 1)]
    for i, ca in enumerate(a, 1):
        for j, cb in enumerate(b, 1):
            table[i][j] = (
                table[i - 1][j - 1] + 1 if ca == cb else
                max(table[i][j - 1], table[i - 1][j]))
    return table[-1][-1]        

def longest_contiguous_subsequence_length(a, b):
    '''this function reads in two lists and returns the length of their longest contiguous subsequence'''
    table = [[0] * (len(b) + 1) for _ in xrange(len(a) + 1)]
    l = 0
    for i, ca in enumerate(a, 1):
        for j, cb in enumerate(b, 1):
            if ca == cb:
                table[i][j] = table[i - 1][j - 1] + 1
                if table[i][j] > l:
                    l = table[i][j]
    return l

def calculate_syntactic_similarity(french_pos_tuples, english_pos_tuples):
    '''this function reads in two lists of (word, pos) tuples and returns their cosine similarity, logest_common_subsequence, and longest_common_contiguous_sequence''' 
    french_pos_list           = [tup[1] for tup in french_pos_tuples]
    english_pos_list          = [tup[1] for tup in english_pos_tuples]
    french_pos_counter        = Counter(french_pos_list)
    english_pos_counter       = Counter(english_pos_list)
    cosine_similarity         = counter_cosine_similarity(french_pos_counter, english_pos_counter)
    lc_subsequence            = longest_common_subsequence_length(french_pos_counter, english_pos_counter) / max(len(french_pos_list), len(english_pos_list))
    lc_contiguous_subsequence = longest_contiguous_subsequence_length(french_pos_counter, english_pos_counter) / max(len(french_pos_list), len(english_pos_list))   
    return cosine_similarity, lc_subsequence, lc_contiguous_subsequence 

########################### 
# Parse POS with Stanford #
###########################

#set java_home path from within script. Run os.getenv("JAVA_HOME") to test java_home
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"

english = u"the whole earth swarms with living beings, every plant, every grain and leaf, supports the life of thousands."
french = u"Chaque plante, chaque graine, chaque particule de matière organique contient des milliers d'atomes animés."

#specify paths 
path_to_english_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger"
path_to_french_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\french.tagger"
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"

#define english and french taggers
english_tagger = POSTagger(path_to_english_model, path_to_jar, encoding="utf-8")
french_tagger = POSTagger(path_to_french_model, path_to_jar, encoding="utf-8")

#each tuple in list_of_english_pos_tuples = (word, pos)
list_of_english_pos_tuples = english_tagger.tag(word_tokenize(english))
list_of_french_pos_tuples = french_tagger.tag(word_tokenize(french))

#simplify each tagset
simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in list_of_english_pos_tuples]
simplified_pos_tags_french = map_french_tag_to_universal( list_of_french_pos_tuples )

print calculate_syntactic_similarity(simplified_pos_tags_french, simplified_pos_tags_english)