/client/python/hlrc_client/textgrid_hlrc.py - Annotate - hlrc - Research for Cognitive Interaction

0c286af0

Simon Schulz

# Natural Language Toolkit: TextGrid analysis

2

3

# Copyright (C) 2001-2011 NLTK Project

4

# Author: Margaret Mitchell <itallow@gmail.com>

5

#         Steven Bird <sb@csse.unimelb.edu.au> (revisions)

6

# URL: <http://www.nltk.org>

7

# For license information, see LICENSE.TXT

8

9

10

"""

11

Tools for reading TextGrid files, the format used by Praat.

12

13

Module contents

14

===============

15

16

The textgrid corpus reader provides 4 data items and 1 function

17

for each textgrid file.  For each tier in the file, the reader

18

provides 10 data items and 2 functions.

19

20

For the full textgrid file:

21

22

  - size

23

    The number of tiers in the file.

24

25

  - xmin

26

    First marked time of the file.

27

28

  - xmax

29

    Last marked time of the file.

30

31

  - t_time

32

    xmax - xmin.

33

34

  - text_type

35

    The style of TextGrid format:

36

        - ooTextFile:  Organized by tier.

37

        - ChronTextFile:  Organized by time.

38

        - OldooTextFile:  Similar to ooTextFile.

39

40

  - to_chron()

41

    Convert given file to a ChronTextFile format.

42

43

  - to_oo()

44

    Convert given file to an ooTextFile format.

45

46

For each tier:

47

48

  - text_type

49

    The style of TextGrid format, as above.

50

51

  - classid

52

    The style of transcription on this tier:

53

        - IntervalTier:  Transcription is marked as intervals.

54

        - TextTier:  Transcription is marked as single points.

55

56

  - nameid

57

    The name of the tier.

58

59

  - xmin

60

    First marked time of the tier.

61

62

  - xmax

63

    Last marked time of the tier.

64

65

  - size

66

    Number of entries in the tier.

67

68

  - transcript

69

    The raw transcript for the tier.

70

71

  - simple_transcript

72

    The transcript formatted as a list of tuples: (time1, time2, utterance).

73

74

  - tier_info

75

    List of (classid, nameid, xmin, xmax, size, transcript).

76

77

  - min_max()

78

    A tuple of (xmin, xmax).

79

80

  - time(non_speech_marker)

81

    Returns the utterance time of a given tier.

82

    Excludes entries that begin with a non-speech marker.

83

84

"""

85

86

# needs more cleanup, subclassing, epydoc docstrings

87

88

import sys

89

import re

90

91

TEXTTIER = "TextTier"

92

INTERVALTIER = "IntervalTier"

93

94

OOTEXTFILE = re.compile(r"""(?x)

95

            xmin\ =\ (.*)[\r\n]+

96

            xmax\ =\ (.*)[\r\n]+

97

            [\s\S]+?size\ =\ (.*)[\r\n]+

98

""")

99

100

CHRONTEXTFILE = re.compile(r"""(?x)

101

            [\r\n]+(\S+)\

102

            (\S+)\ +!\ Time\ domain.\ *[\r\n]+

103

            (\S+)\ +!\ Number\ of\ tiers.\ *[\r\n]+"

104

""")

105

106

OLDOOTEXTFILE = re.compile(r"""(?x)

107

            [\r\n]+(\S+)

108

            [\r\n]+(\S+)

109

            [\r\n]+.+[\r\n]+(\S+)

110

""")

#################################################################

115

# TextGrid Class

116

#################################################################

117

118

class TextGrid(object):

119

"""

120

    Class to manipulate the TextGrid format used by Praat.

121

    Separates each tier within this file into its own Tier

122

    object.  Each TextGrid object has

123

    a number of tiers (size), xmin, xmax, a text type to help

124

    with the different styles of TextGrid format, and tiers with their

125

    own attributes.

126

"""

127

128

    def __init__(self, read_file):

129

"""

130

        Takes open read file as input, initializes attributes

131

        of the TextGrid file.

132

        @type read_file: An open TextGrid file, mode "r".

133

        @param size:  Number of tiers.

134

        @param xmin: xmin.

135

        @param xmax: xmax.

136

        @param t_time:  Total time of TextGrid file.

137

        @param text_type:  TextGrid format.

138

        @type tiers:  A list of tier objects.

139

"""

140

141

        self.read_file = read_file

142

        self.size = 0

143

        self.xmin = 0

144

        self.xmax = 0

145

        self.t_time = 0

146

        self.text_type = self._check_type()

147

        self.tiers = self._find_tiers()

148

149

    def __iter__(self):

150

        for tier in self.tiers:

151

            yield tier

152

153

    def next(self):

154

        if self.idx == (self.size - 1):

155

            raise StopIteration

156

        self.idx += 1

157

        return self.tiers[self.idx]

158

159

    @staticmethod

160

    def load(file):

161

"""

162

        @param file: a file in TextGrid format

163

"""

164

165

        return TextGrid(open(file).read())

166

167

    def _load_tiers(self, header):

168

"""

169

        Iterates over each tier and grabs tier information.

170

"""

171

172

        tiers = []

173

        if self.text_type == "ChronTextFile":

174

            m = re.compile(header)

175

            tier_headers = m.findall(self.read_file)

176

            tier_re = " \d+.?\d* \d+.?\d*[\r\n]+\"[^\"]*\""

177

            for i in range(0, self.size):

178

                tier_info = [tier_headers[i]] + \

179

                re.findall(str(i + 1) + tier_re, self.read_file)

180

                tier_info = "\n".join(tier_info)

181

                tiers.append(Tier(tier_info, self.text_type, self.t_time))

182

            return tiers

183

184

        tier_re = header + "\d+[\s\S]+?(?=" + header + "|$$)"

185

        m = re.compile(tier_re)

186

        tier_iter = m.finditer(self.read_file)

187

        for iterator in tier_iter:

188

            (begin, end) = iterator.span()

189

            tier_info = self.read_file[begin:end]

190

            tiers.append(Tier(tier_info, self.text_type, self.t_time))

191

        return tiers

192

193

    def _check_type(self):

194

"""

195

        Figures out the TextGrid format.

196

"""

197

198

        m = re.match("(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file)

199

        try:

200

            type_id = m.group(1).strip()

201

        except AttributeError:

202

            raise TypeError("Cannot read file -- try TextGrid.load()")

203

        xmin = m.group(4)

204

        if type_id == "File type = \"ooTextFile\"":

205

            if "xmin" not in xmin:

206

                xmin = m.group(3)

207

                if "xmin" not in xmin:

208

                    text_type = "OldooTextFile"

209

                else:

210

                    text_type = "ooTextFile"

211

            else:

212

                text_type = "ooTextFile"

213

        elif type_id == "\"Praat chronological TextGrid text file\"":

214

            text_type = "ChronTextFile"

215

        else:

216

            raise TypeError("Unknown format '(%s)'", (type_id))

217

        return text_type

218

219

    def _find_tiers(self):

220

"""

221

        Splits the textgrid file into substrings corresponding to tiers.

222

"""

223

224

        if self.text_type == "ooTextFile":

225

            m = OOTEXTFILE

226

            header = " *item \["

227

        elif self.text_type == "ChronTextFile":

228

            m = CHRONTEXTFILE

229

            header = "\"\S+\" \".*\" \d+\.?\d* \d+\.?\d*"

230

        elif self.text_type == "OldooTextFile":

231

            m = OLDOOTEXTFILE

232

            header = "\".*\"[\r\n]+\".*\""

233

234

        file_info = m.findall(self.read_file)[0]

235

        self.xmin = float(file_info[0])

236

        self.xmax = float(file_info[1])

237

        self.t_time = self.xmax - self.xmin

238

        self.size = int(file_info[2])

239

        tiers = self._load_tiers(header)

240

        return tiers

241

242

    def to_chron(self):

243

"""

244

        @return:  String in Chronological TextGrid file format.

245

"""

246

247

        chron_file = ""

248

        chron_file += "\"Praat chronological TextGrid text file\"\n"

249

        chron_file += str(self.xmin) + " " + str(self.xmax)

250

        chron_file += "   ! Time domain.\n"

251

        chron_file += str(self.size) + "   ! Number of tiers.\n"

252

        for tier in self.tiers:

253

            idx = (self.tiers.index(tier)) + 1

254

            tier_header = "\"" + tier.classid + "\" \"" \

255

                          + tier.nameid + "\" " + str(tier.xmin) \

256

                          + " " + str(tier.xmax)

257

            chron_file += tier_header + "\n"

258

            transcript = tier.simple_transcript

259

            for (xmin, xmax, utt) in transcript:

260

                chron_file += str(idx) + " " + str(xmin)

261

                chron_file += " " + str(xmax) +"\n"

262

                chron_file += "\"" + utt + "\"\n"

263

        return chron_file

264

265

    def to_oo(self):

266

"""

267

        @return:  A string in OoTextGrid file format.

268

"""

269

270

        oo_file = ""

271

        oo_file += "File type = \"ooTextFile\"\n"

272

        oo_file += "Object class = \"TextGrid\"\n\n"

273

        oo_file += "xmin = ", self.xmin, "\n"

274

        oo_file += "xmax = ", self.xmax, "\n"

275

        oo_file += "tiers? <exists>\n"

276

        oo_file += "size = ", self.size, "\n"

277

        oo_file += "item []:\n"

278

        for i in range(len(self.tiers)):

279

            oo_file += "%4s%s [%s]" % ("", "item", i + 1)

280

            _curr_tier = self.tiers[i]

281

            for (x, y) in _curr_tier.header:

282

                oo_file += "%8s%s = \"%s\"" % ("", x, y)

283

            if _curr_tier.classid != TEXTTIER:

284

                for (xmin, xmax, text) in _curr_tier.simple_transcript:

285

                    oo_file += "%12s%s = %s" % ("", "xmin", xmin)

286

                    oo_file += "%12s%s = %s" % ("", "xmax", xmax)

287

                    oo_file += "%12s%s = \"%s\"" % ("", "text", text)

288

            else:

289

                for (time, mark) in _curr_tier.simple_transcript:

290

                    oo_file += "%12s%s = %s" % ("", "time", time)

291

                    oo_file += "%12s%s = %s" % ("", "mark", mark)

292

        return oo_file

293

294

295

#################################################################

296

# Tier Class

297

#################################################################

298

299

class Tier(object):

300

"""

301

    A container for each tier.

302

"""

303

304

    def __init__(self, tier, text_type, t_time):

305

"""

306

        Initializes attributes of the tier: class, name, xmin, xmax

307

        size, transcript, total time.

308

        Utilizes text_type to guide how to parse the file.

309

        @type tier: a tier object; single item in the TextGrid list.

310

        @param text_type:  TextGrid format

311

        @param t_time:  Total time of TextGrid file.

312

        @param classid:  Type of tier (point or interval).

313

        @param nameid:  Name of tier.

314

        @param xmin:  xmin of the tier.

315

        @param xmax:  xmax of the tier.

316

        @param size:  Number of entries in the tier

317

        @param transcript:  The raw transcript for the tier.

318

"""

319

320

        self.tier = tier

321

        self.text_type = text_type

322

        self.t_time = t_time

323

        self.classid = ""

324

        self.nameid = ""

325

        self.xmin = 0

326

        self.xmax = 0

327

        self.size = 0

328

        self.transcript = ""

329

        self.tier_info = ""

330

        self._make_info()

331

        self.simple_transcript = self.make_simple_transcript()

332

        if self.classid != TEXTTIER:

333

            self.mark_type = "intervals"

334

        else:

335

            self.mark_type = "points"

336

            self.header = [("class", self.classid), ("name", self.nameid), \

337

            ("xmin", self.xmin), ("xmax", self.xmax), ("size", self.size)]

338

339

    def __iter__(self):

340

        return self

341

342

    def _make_info(self):

343

"""

344

        Figures out most attributes of the tier object:

345

        class, name, xmin, xmax, transcript.

346

"""

347

348

        trans = "([\S\s]*)"

349

        if self.text_type == "ChronTextFile":

350

            classid = "\"(.*)\" +"

351

            nameid = "\"(.*)\" +"

352

            xmin = "(\d+\.?\d*) +"

353

            xmax = "(\d+\.?\d*) *[\r\n]+"

354

            # No size values are given in the Chronological Text File format.

355

            self.size = None

356

            size = ""

357

        elif self.text_type == "ooTextFile":

358

            classid = " *class = \"(.*)\" *[\r\n]+"

359

            nameid = " *name = \"(.*)\" *[\r\n]+"

360

            xmin = " *xmin = (\d+\.?\d*) *[\r\n]+"

361

            xmax = " *xmax = (\d+\.?\d*) *[\r\n]+"

362

            size = " *\S+: size = (\d+) *[\r\n]+"

363

        elif self.text_type == "OldooTextFile":

364

            classid = "\"(.*)\" *[\r\n]+"

365

            nameid = "\"(.*)\" *[\r\n]+"

366

            xmin = "(\d+\.?\d*) *[\r\n]+"

367

            xmax = "(\d+\.?\d*) *[\r\n]+"

368

            size = "(\d+) *[\r\n]+"

369

        m = re.compile(classid + nameid + xmin + xmax + size + trans)

370

        self.tier_info = m.findall(self.tier)[0]

371

        self.classid = self.tier_info[0]

372

        self.nameid = self.tier_info[1]

373

        self.xmin = float(self.tier_info[2])

374

        self.xmax = float(self.tier_info[3])

375

        if self.size != None:

376

            self.size = int(self.tier_info[4])

377

        self.transcript = self.tier_info[-1]

378

379

    def make_simple_transcript(self):

380

"""

381

        @return:  Transcript of the tier, in form [(start_time end_time label)]

382

"""

383

384

        if self.text_type == "ChronTextFile":

385

            trans_head = ""

386

            trans_xmin = " (\S+)"

387

            trans_xmax = " (\S+)[\r\n]+"

388

            trans_text = "\"([\S\s]*?)\""

389

        elif self.text_type == "ooTextFile":

390

            trans_head = " *\S+ \[\d+\]: *[\r\n]+"

391

            trans_xmin = " *\S+ = (\S+) *[\r\n]+"

392

            trans_xmax = " *\S+ = (\S+) *[\r\n]+"

393

            trans_text = " *\S+ = \"([^\"]*?)\""

394

        elif self.text_type == "OldooTextFile":

395

            trans_head = ""

396

            trans_xmin = "(.*)[\r\n]+"

397

            trans_xmax = "(.*)[\r\n]+"

398

            trans_text = "\"([\S\s]*?)\""

399

        if self.classid == TEXTTIER:

400

            trans_xmin = ""

401

        trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text)

402

        self.simple_transcript = trans_m.findall(self.transcript)

403

        return self.simple_transcript

404

405

    def transcript(self):

406

"""

407

        @return:  Transcript of the tier, as it appears in the file.

408

"""

409

410

        return self.transcript

411

412

    def time(self, non_speech_char="."):

413

"""

414

        @return: Utterance time of a given tier.

415

        Screens out entries that begin with a non-speech marker.

416

"""

417

418

        total = 0.0

419

        if self.classid != TEXTTIER:

420

            for (time1, time2, utt) in self.simple_transcript:

421

                utt = utt.strip()

422

                if utt and not utt[0] == ".":

423

                    total += (float(time2) - float(time1))

424

        return total

425

426

    def tier_name(self):

427

"""

428

        @return:  Tier name of a given tier.

429

"""

430

431

        return self.nameid

432

433

    def classid(self):

434

"""

435

        @return:  Type of transcription on tier.

436

"""

437

438

        return self.classid

439

440

    def min_max(self):

441

"""

442

        @return:  (xmin, xmax) tuple for a given tier.

443

"""

444

445

        return (self.xmin, self.xmax)

446

447

    def __repr__(self):

448

        return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (self.classid, self.nameid, self.xmin, self.xmax, 100*self.time()/self.t_time)

449

450

    def __str__(self):

451

        return self.__repr__() + "\n  " + "\n  ".join(" ".join(row) for row in self.simple_transcript)

452

453

def demo_TextGrid(demo_data):

454

    print "** Demo of the TextGrid class. **"

455

456

    fid = TextGrid(demo_data)

457

    print "Tiers:", fid.size

458

459

    for i, tier in enumerate(fid):

460

        print "\n***"

461

        print "Tier:", i + 1

462

        print tier

463

464

def demo():

465

    # Each demo demonstrates different TextGrid formats.

466

    print "Format 1"

467

    demo_TextGrid(demo_data1)

468

    print "\nFormat 2"

469

    demo_TextGrid(demo_data2)

470

    print "\nFormat 3"

471

    demo_TextGrid(demo_data3)

472

473

474

demo_data1 = """File type = "ooTextFile"

475

Object class = "TextGrid"

476

477

xmin = 0

478

xmax = 2045.144149659864

479

tiers? <exists>

480

size = 3

481

item []:

482

    item [1]:

483

        class = "IntervalTier"

484

        name = "utterances"

485

        xmin = 0

486

        xmax = 2045.144149659864

487

        intervals: size = 5

488

        intervals [1]:

489

            xmin = 0

490

            xmax = 2041.4217474125382

491

            text = ""

492

        intervals [2]:

493

            xmin = 2041.4217474125382

494

            xmax = 2041.968276643991

495

            text = "this"

496

        intervals [3]:

497

            xmin = 2041.968276643991

498

            xmax = 2042.5281632653062

499

            text = "is"

500

        intervals [4]:

501

            xmin = 2042.5281632653062

502

            xmax = 2044.0487352585324

503

            text = "a"

504

        intervals [5]:

505

            xmin = 2044.0487352585324

506

            xmax = 2045.144149659864

507

            text = "demo"

508

    item [2]:

509

        class = "TextTier"

510

        name = "notes"

511

        xmin = 0

512

        xmax = 2045.144149659864

513

        points: size = 3

514

        points [1]:

515

            time = 2041.4217474125382

516

            mark = ".begin_demo"

517

        points [2]:

518

            time = 2043.8338291031832

519

            mark = "voice gets quiet here"

520

        points [3]:

521

            time = 2045.144149659864

522

            mark = ".end_demo"

523

    item [3]:

524

        class = "IntervalTier"

525

        name = "phones"

526

        xmin = 0

527

        xmax = 2045.144149659864

528

        intervals: size = 12

529

        intervals [1]:

530

            xmin = 0

531

            xmax = 2041.4217474125382

532

            text = ""

533

        intervals [2]:

534

            xmin = 2041.4217474125382

535

            xmax = 2041.5438290324326

536

            text = "D"

537

        intervals [3]:

538

            xmin = 2041.5438290324326

539

            xmax = 2041.7321032910372

540

            text = "I"

541

        intervals [4]:

542

            xmin = 2041.7321032910372

543

            xmax = 2041.968276643991

544

            text = "s"

545

        intervals [5]:

546

            xmin = 2041.968276643991

547

            xmax = 2042.232189031843

548

            text = "I"

549

        intervals [6]:

550

            xmin = 2042.232189031843

551

            xmax = 2042.5281632653062

552

            text = "z"

553

        intervals [7]:

554

            xmin = 2042.5281632653062

555

            xmax = 2044.0487352585324

556

            text = "eI"

557

        intervals [8]:

558

            xmin = 2044.0487352585324

559

            xmax = 2044.2487352585324

560

            text = "dc"

561

        intervals [9]:

562

            xmin = 2044.2487352585324

563

            xmax = 2044.3102321849011

564

            text = "d"

565

        intervals [10]:

566

            xmin = 2044.3102321849011

567

            xmax = 2044.5748932104329

568

            text = "E"

569

        intervals [11]:

570

            xmin = 2044.5748932104329

571

            xmax = 2044.8329108578437

572

            text = "m"

573

        intervals [12]:

574

            xmin = 2044.8329108578437

575

            xmax = 2045.144149659864

576

            text = "oU"

577

"""

578

579

demo_data2 = """File type = "ooTextFile"

580

Object class = "TextGrid"

581

582

583

2.8

584

<exists>

585

586

"IntervalTier"

587

"utterances"

588

589

2.8

590

591

592

1.6229213249309031

593

""

594

1.6229213249309031

595

2.341428074708195

596

"demo"

597

2.341428074708195

598

2.8

599

""

600

"IntervalTier"

601

"phones"

602

603

2.8

604

605

606

1.6229213249309031

607

""

608

1.6229213249309031

609

1.6428291382019483

610

"dc"

611

1.6428291382019483

612

1.65372183721983721

613

"d"

614

1.65372183721983721

615

1.94372874328943728

616

"E"

617

1.94372874328943728

618

2.13821938291038210

619

"m"

620

2.13821938291038210

621

2.341428074708195

622

"oU"

623

2.341428074708195

624

2.8

625

""

626

"""

627

628

demo_data3 = """"Praat chronological TextGrid text file"

629

0 2.8   ! Time domain.

630

2   ! Number of tiers.

631

"IntervalTier" "utterances" 0 2.8

632

"IntervalTier" "utterances" 0 2.8

633

1 0 1.6229213249309031

634

""

635

2 0 1.6229213249309031

636

""

637

2 1.6229213249309031 1.6428291382019483

638

"dc"

639

2 1.6428291382019483 1.65372183721983721

640

"d"

641

2 1.65372183721983721 1.94372874328943728

642

"E"

643

2 1.94372874328943728 2.13821938291038210

644

"m"

645

2 2.13821938291038210 2.341428074708195

646

"oU"

647

1 1.6229213249309031 2.341428074708195

648

"demo"

649

1 2.341428074708195 2.8

650

""

651

2 2.341428074708195 2.8

652

""

653

"""

654

655

if __name__ == "__main__":

656

    demo()

hlrc

hlrc / client / python / hlrc_client / textgrid_hlrc.py @ 10e164da