lingpy / lingpy
Showing 2 of 3 files from the diff.

@@ -6,10 +6,11 @@
 from lingpy.sequence.sound_classes import codepoint, clean_string, token2class
 from lingpy import log
 from lingpy.util import pb
-
+import unicodedata
 
 def simple_profile(wordlist, ref='ipa', semi_diacritics='hsʃ̢ɕʂʐʑʒw', merge_vowels=False,
         brackets=None, splitters='/,;~', merge_geminates=True,
+        normalization_form="NFC",
         bad_word="<???>", bad_sound="<?>", clts=None, unknown_sound="!{0}"):
     """
     Create an initial Orthography Profile using Lingpy's clean_string procedure.
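
A minimal illustration (not part of the diff) of why the new normalization_form parameter and the unicodedata import matter: the same transcription can be encoded with precomposed or decomposed diacritics, and without a fixed normalization form the two spellings end up as distinct profile entries.

    import unicodedata

    # "é" as one precomposed codepoint vs. "e" plus a combining acute accent:
    # visually identical, but unequal as strings until both are normalized.
    decomposed = unicodedata.normalize("NFD", "é")           # 'e' + U+0301, length 2
    precomposed = unicodedata.normalize("NFC", decomposed)   # single U+00E9, length 1
    print(len(decomposed), len(precomposed), decomposed == precomposed)  # 2 1 False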
@@ -60,12 +61,14 @@
     bad_words = set()
     brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
     profile = defaultdict(int)
-    words = [wordlist[idx, ref] for idx in wordlist]
+    words = [unicodedata.normalize(normalization_form, wordlist[idx, ref])
+            for idx in wordlist]
     for word in pb(words, desc='iterating over words'):
         if isinstance(word, list):
             word = ' '.join(word)
         cleaned_string = clean_string(word, semi_diacritics=semi_diacritics,
-                merge_vowels=merge_vowels, brackets=None, ignore_brackets=False,
+                merge_vowels=merge_vowels,
+                normalization_form=normalization_form, brackets=None, ignore_brackets=False,
                 split_entries=False, preparse=None, rules=None,
                 merge_geminates=merge_geminates)[0]
 
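A usage sketch for the updated simple_profile (not part of the diff; 'wordlist.tsv' and the 'ipa' column are placeholders for a real dataset):

    from lingpy.basic.wordlist import Wordlist
    from lingpy.sequence.profile import simple_profile

    wl = Wordlist('wordlist.tsv')  # hypothetical tab-separated wordlist
    for line in simple_profile(wl, ref='ipa', normalization_form='NFC'):
        print('\t'.join(line))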
@@ -103,7 +106,8 @@
         semi_diacritics='hsʃ̢ɕʂʐʑʒw', merge_vowels=False, brackets=None,
         splitters='/,;~', merge_geminates=True, clts=False,
         bad_word="<???>", bad_sound="<?>", unknown_sound="!{0}", examples=2,
-        max_entries=100):
+        max_entries=100,
+        normalization_form="NFC"):
     """
     Create an advanced Orthography Profile with context and doculect information.
 
@@ -164,10 +168,12 @@
         log.info('processing {0}-{1}'.format(idx, word))
         if isinstance(word, list):
             word = ' '.join(word)
+        word = unicodedata.normalize(normalization_form, word)
         if word.strip():
             try:
                 cleaned_string = clean_string(word, semi_diacritics=semi_diacritics,
                         merge_vowels=merge_vowels, brackets=None, ignore_brackets=False,
+                        normalization_form=normalization_form,
                         split_entries=False, preparse=None, rules=None,
                         merge_geminates=merge_geminates)[0].split(' ')
 
@@ -180,9 +186,11 @@
                     context_post = (len(cleaned_string)-1) * [''] + ['$']
                     for ctxA, ctxB, segment in zip(context_pre, context_post, cleaned_string):
                         profile[ctxA+segment+ctxB] += [(language, word)]
-                    for segment in [x for x in word if x not in ' '.join(cleaned_string)]:
-                        profile[segment] += [(language, word)]
-                        nulls.add(segment)
+                    for segment in [x for x in word if x not in
+                            ' '.join(cleaned_string)]:
+                        if segment.strip():
+                            profile[segment] += [(language, word)]
+                            nulls.add(segment)
             except:
                 errors.add(idx)
                 log.warning('problem parsing {0}'.format(word))
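
The new strip() guard keeps whitespace out of the null-segment bookkeeping. A rough illustration (not part of the diff) of the difference when the cleaned string contains a single segment:

    word = 'a b'
    cleaned_string = ['a']  # assume cleaning dropped 'b' and the space
    leftovers = [x for x in word if x not in ' '.join(cleaned_string)]
    print(leftovers)                             # [' ', 'b']
    print([x for x in leftovers if x.strip()])   # ['b'] -- the space is no longer logged as a null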

@@ -294,6 +294,9 @@
                 p, 'merge', False, 'Merge vowels in profile creation.',
                 short_opt='m'
                 )
+        add_option(
+                p, 'normalize', 'NFC', 'Set normalization of wordlist.',
+                short_opt='n')
         add_option(
                 p, 'context', False, 'Create profile with context.',
                 short_opt='c')
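
Not lingpy's actual CLI helper, just an argparse analogue (illustrative names only) of what the new 'normalize' option provides to the script below:

    import argparse

    p = argparse.ArgumentParser(prog='profile')
    p.add_argument('-n', '--normalize', default='NFC',
                   help='Set normalization of wordlist.')
    args = p.parse_args(['--normalize', 'NFD'])
    print(args.normalize)  # 'NFD'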
@@ -337,10 +340,12 @@
             wl = lingpy.basic.wordlist.Wordlist(D)
         if args.context:
             for line in lingpy.sequence.profile.context_profile(wl, ref=args.column,
-                    clts=clts, merge_vowels=args.merge):
+                    clts=clts, merge_vowels=args.merge,
+                    normalization_form=args.normalize):
                 out += ['\t'.join(line)]
         else:
             for line in lingpy.sequence.profile.simple_profile(wl,
+                    normalization_form=args.normalize,
                     ref=args.column, clts=clts, merge_vowels=args.merge):
                 out += ['\t'.join(line)]
         if args.output_file == 'stdout':
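
Library-level equivalent of the --context branch above (not part of the diff; the file name and column are placeholders):

    from lingpy.basic.wordlist import Wordlist
    from lingpy.sequence.profile import context_profile

    wl = Wordlist('wordlist.tsv')  # hypothetical input
    for line in context_profile(wl, ref='ipa', merge_vowels=False,
                                normalization_form='NFD'):
        print('\t'.join(line))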
Files Coverage
  src/lingpy                   87.79%
  tests                        98.86%
  Project Totals (123 files)   89.43%