Statistics
| Branch: | Tag: | Revision:

hlrc / client / python / hlrc_client / textgrid_hlrc.py @ e21d7f2c

History | View | Annotate | Download (18.409 KB)

1
# Natural Language Toolkit: TextGrid analysis
2
#
3
# Copyright (C) 2001-2011 NLTK Project
4
# Author: Margaret Mitchell <itallow@gmail.com>
5
#         Steven Bird <sb@csse.unimelb.edu.au> (revisions)
6
# URL: <http://www.nltk.org>
7
# For license information, see LICENSE.TXT
8
#
9

    
10
"""
11
Tools for reading TextGrid files, the format used by Praat.
12

13
Module contents
14
===============
15

16
The textgrid corpus reader provides 5 data items and 2 functions
17
for each textgrid file.  For each tier in the file, the reader
18
provides 9 data items and 2 functions.
19
 
20
For the full textgrid file: 
21

22
  - size
23
    The number of tiers in the file.
24

25
  - xmin
26
    First marked time of the file.
27

28
  - xmax
29
    Last marked time of the file.
30

31
  - t_time
32
    xmax - xmin.
33

34
  - text_type
35
    The style of TextGrid format:
36
        - ooTextFile:  Organized by tier.
37
        - ChronTextFile:  Organized by time.
38
        - OldooTextFile:  Similar to ooTextFile.
39

40
  - to_chron()
41
    Convert given file to a ChronTextFile format.
42

43
  - to_oo()
44
    Convert given file to an ooTextFile format.
45

46
For each tier:
47

48
  - text_type
49
    The style of TextGrid format, as above.
50

51
  - classid
52
    The style of transcription on this tier:
53
        - IntervalTier:  Transcription is marked as intervals.
54
        - TextTier:  Transcription is marked as single points.
55

56
  - nameid
57
    The name of the tier.
58

59
  - xmin
60
    First marked time of the tier.
61

62
  - xmax
63
    Last marked time of the tier.
64

65
  - size
66
    Number of entries in the tier.
67

68
  - transcript
69
    The raw transcript for the tier.
70

71
  - simple_transcript
72
    The transcript formatted as a list of tuples: (time1, time2, utterance).
73

74
  - tier_info
75
    List of (classid, nameid, xmin, xmax, size, transcript).
76

77
  - min_max()
78
    A tuple of (xmin, xmax).  
79

80
  - time(non_speech_char)
81
    Returns the utterance time of a given tier.
82
    Excludes entries that begin with a non-speech marker.
83

84
"""
85

    
86
# needs more cleanup, subclassing, epydoc docstrings
87

    
88
import sys
89
import re
90

    
91
# Values of the "class" field that Praat writes for the two kinds of tier.
TEXTTIER = "TextTier"
INTERVALTIER = "IntervalTier"

# File-level header of the long ("ooTextFile") format.
# Groups: (xmin, xmax, number of tiers).
OOTEXTFILE = re.compile(r"""(?x)
            xmin\ =\ (.*)[\r\n]+
            xmax\ =\ (.*)[\r\n]+
            [\s\S]+?size\ =\ (.*)[\r\n]+
""")

# File-level header of the chronological format.
# Groups: (xmin, xmax, number of tiers).
# The separating space is written as "[ ]" so that the pattern does not
# depend on trailing whitespace surviving in this source file, and the
# pattern no longer demands a literal '"' after the tier-count line.
CHRONTEXTFILE = re.compile(r"""(?x)
            [\r\n]+(\S+)[ ]
            (\S+)\ +!\ Time\ domain\.\ *[\r\n]+
            (\S+)\ +!\ Number\ of\ tiers\.\ *[\r\n]+
""")

# File-level header of the short ("old ooTextFile") format, where the
# values appear one per line without labels.
# Groups: (xmin, xmax, number of tiers).
OLDOOTEXTFILE = re.compile(r"""(?x)
            [\r\n]+(\S+)
            [\r\n]+(\S+)
            [\r\n]+.+[\r\n]+(\S+)
""")
111

    
112

    
113

    
114
#################################################################
115
# TextGrid Class
116
#################################################################
117

    
118
class TextGrid(object):
    """
    Class to manipulate the TextGrid format used by Praat.

    The text of one TextGrid file is split into its tiers, each of which
    is wrapped in its own Tier object.  A TextGrid records the number of
    tiers (size), xmin, xmax, the total time (t_time) and a text type
    naming which of the three TextGrid layouts the text uses.
    """

    def __init__(self, read_file):
        """
        Parses the text of a TextGrid file and builds its tiers.

        @type read_file: string
        @param read_file: The complete contents of a TextGrid file.  This
            is the text itself, not a path or an open file; use
            TextGrid.load() to read from a path.
        @raise TypeError: If the layout of the text cannot be identified.
        """

        self.read_file = read_file
        self.size = 0
        self.xmin = 0
        self.xmax = 0
        self.t_time = 0
        # Cursor used by next(); -1 means "before the first tier".
        self.idx = -1
        self.text_type = self._check_type()
        self.tiers = self._find_tiers()

    def __iter__(self):
        """
        @return: An iterator over the Tier objects, in file order.
        """

        return iter(self.tiers)

    def next(self):
        """
        Steps an explicit cursor through the tiers.

        Independent of __iter__, which hands out fresh iterators.

        @return: The next Tier.
        @raise StopIteration: When every tier has been returned.
        """

        if self.idx >= len(self.tiers) - 1:
            raise StopIteration
        self.idx += 1
        return self.tiers[self.idx]

    # Python 3 spelling of the same method.
    __next__ = next

    @staticmethod
    def load(file):
        """
        @param file: Path of a file in TextGrid format.
        @return: A TextGrid built from the contents of that file.
        """

        # The context manager closes the handle even if parsing fails.
        with open(file) as handle:
            return TextGrid(handle.read())

    def _load_tiers(self, header):
        """
        Cuts the text into one chunk per tier and wraps each in a Tier.

        @param header: Regular expression matching the start of a tier.
        @return: List of Tier objects.
        """

        tiers = []
        if self.text_type == "ChronTextFile":
            # All tier headers come first; the entries of every tier
            # follow, each prefixed with the 1-based number of its tier.
            tier_headers = re.findall(header, self.read_file)
            entry_re = r" \d+.?\d* \d+.?\d*[\r\n]+\"[^\"]*\""
            for i in range(self.size):
                # The lookbehind pins the tier number to the start of a
                # line, so tier 1 does not also collect the entries of
                # tier 11, 21, ...
                pattern = r"(?<![^\r\n])" + str(i + 1) + entry_re
                tier_info = [tier_headers[i]]
                tier_info.extend(re.findall(pattern, self.read_file))
                tiers.append(Tier("\n".join(tier_info),
                                  self.text_type, self.t_time))
            return tiers

        # Other layouts: a tier runs from its header up to the next
        # header, or to the end of the text.
        tier_re = re.compile(header + r"[\s\S]+?(?=" + header + r"|$$)")
        for match in tier_re.finditer(self.read_file):
            tiers.append(Tier(match.group(0), self.text_type, self.t_time))
        return tiers

    def _check_type(self):
        """
        Figures out the TextGrid format from the first four lines.

        @return: "ooTextFile", "OldooTextFile" or "ChronTextFile".
        @raise TypeError: If the text has fewer than four lines or its
            first line names no known format.
        """

        m = re.match(r"(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file)
        if m is None:
            raise TypeError("Cannot read file -- try TextGrid.load()")
        type_id = m.group(1).strip()
        if type_id == "File type = \"ooTextFile\"":
            # Only the long layout labels its values; the short layout
            # shares the first line but lists bare numbers.
            if "xmin" in m.group(4) or "xmin" in m.group(3):
                return "ooTextFile"
            return "OldooTextFile"
        if type_id == "\"Praat chronological TextGrid text file\"":
            return "ChronTextFile"
        raise TypeError("Unknown format '(%s)'" % type_id)

    def _find_tiers(self):
        """
        Reads the file-level header and splits the text into tiers.

        Sets xmin, xmax, t_time and size as a side effect.

        @return: List of Tier objects.
        """

        if self.text_type == "ooTextFile":
            m = OOTEXTFILE
            # Requiring the digits skips the bare "item []:" line.
            header = r" *item \[\d+"
        elif self.text_type == "ChronTextFile":
            m = CHRONTEXTFILE
            header = r"\"\S+\" \".*\" \d+\.?\d* \d+\.?\d*"
        elif self.text_type == "OldooTextFile":
            m = OLDOOTEXTFILE
            # A tier starts with its class and its name on consecutive
            # lines.  Naming the two tier classes keeps the quoted label
            # that ends one tier from being taken for the next header.
            header = r'"(?:%s|%s)" *[\r\n]+".*"' % (INTERVALTIER, TEXTTIER)

        file_info = m.findall(self.read_file)[0]
        self.xmin = float(file_info[0])
        self.xmax = float(file_info[1])
        self.t_time = self.xmax - self.xmin
        self.size = int(file_info[2])
        return self._load_tiers(header)

    def to_chron(self):
        """
        Writes the tiers in the chronological layout: all tier headers
        first, then every entry ordered by its start time (entries with
        equal start times keep tier order).

        @return:  String in Chronological TextGrid file format.
        """

        lines = [
            "\"Praat chronological TextGrid text file\"",
            "%s %s   ! Time domain." % (self.xmin, self.xmax),
            "%s   ! Number of tiers." % len(self.tiers),
        ]
        entries = []
        for number, tier in enumerate(self.tiers, 1):
            lines.append("\"%s\" \"%s\" %s %s"
                         % (tier.classid, tier.nameid, tier.xmin, tier.xmax))
            for entry in tier.simple_transcript:
                entries.append((number, entry))
        # list.sort is stable, which gives the documented tie order.
        entries.sort(key=lambda item: float(item[1][0]))
        for number, entry in entries:
            # An entry is (xmin, xmax, text) on an interval tier and
            # (time, mark) on a point tier; the label is always last.
            times = " ".join(value.strip() for value in entry[:-1])
            lines.append("%s %s" % (number, times))
            lines.append("\"%s\"" % entry[-1])
        return "\n".join(lines) + "\n"

    def to_oo(self):
        """
        Writes the tiers in the long, labelled layout.

        @return:  A string in OoTextGrid file format.
        """

        lines = [
            "File type = \"ooTextFile\"",
            "Object class = \"TextGrid\"",
            "",
            "xmin = %s" % self.xmin,
            "xmax = %s" % self.xmax,
            "tiers? <exists>",
            "size = %s" % len(self.tiers),
            "item []:",
        ]
        for number, tier in enumerate(self.tiers, 1):
            entries = tier.simple_transcript
            # "intervals" or "points": the word this layout uses to
            # label the entries of the tier.
            kind = tier.mark_type
            lines.append("%4sitem [%d]:" % ("", number))
            lines.append("%8sclass = \"%s\"" % ("", tier.classid))
            lines.append("%8sname = \"%s\"" % ("", tier.nameid))
            lines.append("%8sxmin = %s" % ("", tier.xmin))
            lines.append("%8sxmax = %s" % ("", tier.xmax))
            lines.append("%8s%s: size = %d" % ("", kind, len(entries)))
            for position, entry in enumerate(entries, 1):
                lines.append("%8s%s [%d]:" % ("", kind, position))
                if kind == "points":
                    (time, mark) = entry
                    lines.append("%12stime = %s" % ("", time.strip()))
                    lines.append("%12smark = \"%s\"" % ("", mark))
                else:
                    (xmin, xmax, text) = entry
                    lines.append("%12sxmin = %s" % ("", xmin.strip()))
                    lines.append("%12sxmax = %s" % ("", xmax.strip()))
                    lines.append("%12stext = \"%s\"" % ("", text))
        return "\n".join(lines) + "\n"
293

    
294

    
295
#################################################################
296
# Tier Class
297
#################################################################
298

    
299
class Tier(object):
    """
    A container for each tier.

    Iterating over a Tier yields the entries of simple_transcript.
    """

    def __init__(self, tier, text_type, t_time):
        """
        Parses the text of one tier.

        Sets classid, nameid, xmin, xmax, size, transcript, tier_info,
        simple_transcript, mark_type and header.

        @type tier: string
        @param tier: The text of a single tier, cut out of a TextGrid.
        @param text_type:  TextGrid format; guides how the text is parsed.
        @param t_time:  Total time of TextGrid file.
        """

        self.tier = tier
        self.text_type = text_type
        self.t_time = t_time
        self.classid = ""
        self.nameid = ""
        self.xmin = 0
        self.xmax = 0
        self.size = 0
        self.transcript = ""
        self.tier_info = ""
        self._make_info()
        self.simple_transcript = self.make_simple_transcript()
        if self.classid != TEXTTIER:
            self.mark_type = "intervals"
        else:
            self.mark_type = "points"
        # Set for every tier, whatever its class.
        self.header = [("class", self.classid), ("name", self.nameid),
                       ("xmin", self.xmin), ("xmax", self.xmax),
                       ("size", self.size)]

    def __iter__(self):
        """
        @return: An iterator over the entries of simple_transcript.
        """

        return iter(self.simple_transcript)

    def _make_info(self):
        """
        Figures out most attributes of the tier object:
        class, name, xmin, xmax, size, transcript.

        Stores the matched groups in tier_info.
        """

        trans = r"([\S\s]*)"
        if self.text_type == "ChronTextFile":
            classid = r"\"(.*)\" +"
            nameid = r"\"(.*)\" +"
            xmin = r"(\d+\.?\d*) +"
            xmax = r"(\d+\.?\d*) *[\r\n]+"
            # No size values are given in the Chronological Text File format.
            self.size = None
            size = ""
        elif self.text_type == "ooTextFile":
            classid = r" *class = \"(.*)\" *[\r\n]+"
            nameid = r" *name = \"(.*)\" *[\r\n]+"
            xmin = r" *xmin = (\d+\.?\d*) *[\r\n]+"
            xmax = r" *xmax = (\d+\.?\d*) *[\r\n]+"
            size = r" *\S+: size = (\d+) *[\r\n]+"
        elif self.text_type == "OldooTextFile":
            classid = r"\"(.*)\" *[\r\n]+"
            nameid = r"\"(.*)\" *[\r\n]+"
            xmin = r"(\d+\.?\d*) *[\r\n]+"
            xmax = r"(\d+\.?\d*) *[\r\n]+"
            size = r"(\d+) *[\r\n]+"
        m = re.compile(classid + nameid + xmin + xmax + size + trans)
        self.tier_info = m.findall(self.tier)[0]
        self.classid = self.tier_info[0]
        self.nameid = self.tier_info[1]
        self.xmin = float(self.tier_info[2])
        self.xmax = float(self.tier_info[3])
        # size was set to None above when the format carries no size field.
        if self.size is not None:
            self.size = int(self.tier_info[4])
        self.transcript = self.tier_info[-1]

    def make_simple_transcript(self):
        """
        Parses the raw transcript into tuples of strings and stores the
        result in simple_transcript.

        @return:  Transcript of the tier, in form [(start_time end_time label)]
            for an interval tier and [(time label)] for a point tier.
        """

        if self.text_type == "ChronTextFile":
            trans_head = ""
            trans_xmin = r" (\S+)"
            trans_xmax = r" (\S+)[\r\n]+"
            trans_text = r"\"([\S\s]*?)\""
        elif self.text_type == "ooTextFile":
            trans_head = r" *\S+ \[\d+\]: *[\r\n]+"
            trans_xmin = r" *\S+ = (\S+) *[\r\n]+"
            trans_xmax = r" *\S+ = (\S+) *[\r\n]+"
            trans_text = r" *\S+ = \"([^\"]*?)\""
        elif self.text_type == "OldooTextFile":
            trans_head = ""
            trans_xmin = r"(.*)[\r\n]+"
            trans_xmax = r"(.*)[\r\n]+"
            trans_text = r"\"([\S\s]*?)\""
        if self.classid == TEXTTIER:
            # A point has a single time, so one time pattern is dropped.
            trans_xmin = ""
        trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text)
        self.simple_transcript = trans_m.findall(self.transcript)
        return self.simple_transcript

    # NOTE: __init__ assigns an instance attribute called "transcript",
    # which hides this method on every instance; read the attribute.
    def transcript(self):
        """
        @return:  Transcript of the tier, as it appears in the file.
        """

        return self.transcript

    def time(self, non_speech_char="."):
        """
        @param non_speech_char: Prefix marking an entry as non-speech.
            An empty string disables the filter.
        @return: Utterance time of a given tier.
        Screens out empty entries and entries that begin with the
        non-speech marker.  Always 0.0 for a point tier.
        """

        total = 0.0
        if self.classid != TEXTTIER:
            for (time1, time2, utt) in self.simple_transcript:
                utt = utt.strip()
                if not utt:
                    continue
                if non_speech_char and utt.startswith(non_speech_char):
                    continue
                total += (float(time2) - float(time1))
        return total

    def tier_name(self):
        """
        @return:  Tier name of a given tier.
        """

        return self.nameid

    # NOTE: hidden on instances by the "classid" attribute, in the same
    # way as transcript() above.
    def classid(self):
        """
        @return:  Type of transcription on tier.
        """

        return self.classid

    def min_max(self):
        """
        @return:  (xmin, xmax) tuple for a given tier.
        """

        return (self.xmin, self.xmax)

    def __repr__(self):
        # Share of the file's total time taken up by speech on this
        # tier; reported as 0 when the file has no duration.
        if self.t_time:
            percent = 100 * self.time() / self.t_time
        else:
            percent = 0.0
        return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (
            self.classid, self.nameid, self.xmin, self.xmax, percent)

    def __str__(self):
        rows = [" ".join(row) for row in self.simple_transcript]
        return self.__repr__() + "\n  " + "\n  ".join(rows)
452

    
453
def demo_TextGrid(demo_data):
    """
    Parses one TextGrid text and prints its tier count and every tier.

    @param demo_data: The contents of a TextGrid file, as a string.
    """

    # Every print call takes a single argument, so the output is the
    # same whether print is a statement (Python 2) or a function.
    print("** Demo of the TextGrid class. **")

    fid = TextGrid(demo_data)
    print("Tiers: %s" % fid.size)

    for i, tier in enumerate(fid):
        print("\n***")
        print("Tier: %s" % (i + 1))
        print(tier)
463

    
464
def demo():
    """
    Runs demo_TextGrid on each of the three sample texts in this module.
    """

    # Each demo demonstrates different TextGrid formats.
    # Single-argument print calls behave the same on Python 2 and 3.
    print("Format 1")
    demo_TextGrid(demo_data1)
    print("\nFormat 2")
    demo_TextGrid(demo_data2)
    print("\nFormat 3")
    demo_TextGrid(demo_data3)
472

    
473

    
474
# Sample in the long, labelled layout (File type "ooTextFile" with
# "xmin = ..." lines): two interval tiers and one point tier.
demo_data1 = """File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0 
xmax = 2045.144149659864
tiers? <exists> 
size = 3 
item []: 
    item [1]:
        class = "IntervalTier" 
        name = "utterances" 
        xmin = 0 
        xmax = 2045.144149659864 
        intervals: size = 5 
        intervals [1]:
            xmin = 0 
            xmax = 2041.4217474125382 
            text = "" 
        intervals [2]:
            xmin = 2041.4217474125382 
            xmax = 2041.968276643991 
            text = "this" 
        intervals [3]:
            xmin = 2041.968276643991 
            xmax = 2042.5281632653062 
            text = "is" 
        intervals [4]:
            xmin = 2042.5281632653062 
            xmax = 2044.0487352585324 
            text = "a" 
        intervals [5]:
            xmin = 2044.0487352585324 
            xmax = 2045.144149659864 
            text = "demo" 
    item [2]:
        class = "TextTier" 
        name = "notes" 
        xmin = 0 
        xmax = 2045.144149659864 
        points: size = 3 
        points [1]:
            time = 2041.4217474125382 
            mark = ".begin_demo"
        points [2]:
            time = 2043.8338291031832
            mark = "voice gets quiet here" 
        points [3]:
            time = 2045.144149659864
            mark = ".end_demo" 
    item [3]:
        class = "IntervalTier" 
        name = "phones" 
        xmin = 0 
        xmax = 2045.144149659864
        intervals: size = 12
        intervals [1]:
            xmin = 0 
            xmax = 2041.4217474125382 
            text = "" 
        intervals [2]:
            xmin = 2041.4217474125382 
            xmax = 2041.5438290324326 
            text = "D"
        intervals [3]:
            xmin = 2041.5438290324326
            xmax = 2041.7321032910372
            text = "I"
        intervals [4]:
            xmin = 2041.7321032910372            
            xmax = 2041.968276643991 
            text = "s" 
        intervals [5]:
            xmin = 2041.968276643991 
            xmax = 2042.232189031843
            text = "I"
        intervals [6]:
            xmin = 2042.232189031843
            xmax = 2042.5281632653062 
            text = "z" 
        intervals [7]:
            xmin = 2042.5281632653062 
            xmax = 2044.0487352585324 
            text = "eI" 
        intervals [8]:
            xmin = 2044.0487352585324 
            xmax = 2044.2487352585324
            text = "dc"
        intervals [9]:
            xmin = 2044.2487352585324
            xmax = 2044.3102321849011
            text = "d"
        intervals [10]:
            xmin = 2044.3102321849011
            xmax = 2044.5748932104329
            text = "E"
        intervals [11]:
            xmin = 2044.5748932104329
            xmax = 2044.8329108578437
            text = "m"
        intervals [12]:
            xmin = 2044.8329108578437
            xmax = 2045.144149659864 
            text = "oU" 
"""
578

    
579
# Sample in the short layout (File type "ooTextFile", bare values one
# per line): two interval tiers.  Each tier is class, name, xmin, xmax,
# number of intervals, then (xmin, xmax, text) per interval.
demo_data2 = """File type = "ooTextFile"
Object class = "TextGrid"

0
2.8
<exists>
2
"IntervalTier"
"utterances"
0
2.8
3
0
1.6229213249309031
""
1.6229213249309031
2.341428074708195
"demo"
2.341428074708195
2.8
""
"IntervalTier"
"phones"
0
2.8
7
0
1.6229213249309031
""
1.6229213249309031
1.6428291382019483
"dc"
1.6428291382019483
1.65372183721983721
"d"
1.65372183721983721
1.94372874328943728
"E"
1.94372874328943728
2.13821938291038210
"m"
2.13821938291038210
2.341428074708195
"oU"
2.341428074708195
2.8
""
"""
627

    
628
# Sample in the chronological layout: the two tier headers come first,
# then every interval, each prefixed with the number of its tier.
# Tier 2 carries the phone labels and is named "phones".
demo_data3 = """"Praat chronological TextGrid text file"
0 2.8   ! Time domain.
2   ! Number of tiers.
"IntervalTier" "utterances" 0 2.8
"IntervalTier" "phones" 0 2.8
1 0 1.6229213249309031
""
2 0 1.6229213249309031
""
2 1.6229213249309031 1.6428291382019483
"dc"
2 1.6428291382019483 1.65372183721983721
"d"
2 1.65372183721983721 1.94372874328943728
"E"
2 1.94372874328943728 2.13821938291038210
"m"
2 2.13821938291038210 2.341428074708195
"oU"
1 1.6229213249309031 2.341428074708195
"demo"
1 2.341428074708195 2.8
""
2 2.341428074708195 2.8
""
"""

if __name__ == "__main__":
    demo()
657