Source code for jaclearn.nlp.tree.ptb

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# File   : ptb.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 07/04/2018
#
# This file is part of Jacinle.
# Distributed under terms of the MIT license.

"""
Constituency trees in Penn Treebank (PTB) bracketed format.
"""

__all__ = ['PTBNode', 'parse_ptb']

from .node import Node
from .traversal import traversal


class PTBNode(Node):
    """A constituency-tree node that can be read from / written to PTB brackets.

    Each node carries a ``vtype`` (stored by the :class:`Node` base class), an
    optional ``token`` and an integer ``index``. Tree structure (``children``,
    ``is_leaf``, ``append_child``) is provided by the base class.
    """

    def __init__(self, vtype, token=None, index=-1):
        """Create a node.

        Args:
            vtype: the node label, forwarded to the base-class constructor.
            token: the word attached to this node, or ``None``.
            index: the position assigned to this node; defaults to ``-1``
                (presumably meaning "not assigned yet" -- see
                :meth:`assign_index`).
        """
        # The second base-class argument is always passed as None here.
        super().__init__(vtype, None)
        self.token = token
        self.index = index

    @property
    def leftmost_index(self):
        """The ``index`` of the leaf reached by always following the first child."""
        if self.is_leaf:
            return self.index
        return self.children[0].leftmost_index

    @property
    def rightmost_index(self):
        """The ``index`` of the leaf reached by always following the last child."""
        if self.is_leaf:
            return self.index
        return self.children[-1].rightmost_index

    @classmethod
    def _build_node(cls, items, incl_vtype, default_vtype):
        """Turn the items found between one pair of parentheses into a node."""
        if incl_vtype:
            if not items:
                # "()" carries no label to use as the vtype.
                raise ValueError('Invalid PTB encoding.')
            if len(items) == 2 and isinstance(items[1], str):
                # "(vtype token)" is a leaf.
                return cls(items[0], items[1])
            node = cls(items[0])
            for child in items[1:]:
                node.append_child(child)
            return node

        if len(items) == 1 and isinstance(items[0], str):
            # "(token)" is a leaf.
            return cls(default_vtype, items[0])
        node = cls(default_vtype)
        for child in items:
            if isinstance(child, str):
                # Bare words become leaves of their own.
                child = cls(default_vtype, child)
            node.append_child(child)
        return node

    @classmethod
    def from_string(cls, encoding, incl_vtype=True, default_vtype=None):
        """Parse a bracketed encoding into a tree.

        Args:
            encoding: either a string, which is split on whitespace, or a
                list/tuple of string tokens. Parentheses may be stand-alone
                tokens or glued to a neighbouring token (``"(NP"``,
                ``"dog)"``, ``"(dog)"``).
            incl_vtype: if true, the first item inside every pair of
                parentheses is used as the node's vtype. Otherwise every node
                gets ``default_vtype`` and all items are tokens / children.
            default_vtype: the vtype used when ``incl_vtype`` is false.

        Returns:
            The root node, an instance of ``cls``.

        Raises:
            ValueError: if the parentheses are unbalanced, a group is empty
                while ``incl_vtype`` is true, or the encoding does not reduce
                to exactly one node.
        """
        if isinstance(encoding, str):
            steps = encoding.split()
        else:
            assert isinstance(encoding, (tuple, list))
            steps = encoding

        stack = []
        for step in steps:
            body = step

            # Leading parentheses open new groups.
            while body.startswith('('):
                stack.append('(')
                body = body[1:]

            # Trailing parentheses close groups. This is checked even when the
            # token also had leading parentheses, so that "(dog)" works.
            nr_close = 0
            while body.endswith(')'):
                nr_close += 1
                body = body[:-1]

            # Keep the remaining text; an empty token that never had any
            # parentheses is kept as-is.
            if body or not step:
                stack.append(body)

            for _ in range(nr_close):
                items = []
                while True:
                    if not stack:
                        # A ")" without a matching "(".
                        raise ValueError('Invalid PTB encoding.')
                    top = stack.pop()
                    if isinstance(top, str) and top == '(':
                        break
                    items.append(top)
                items.reverse()
                stack.append(cls._build_node(items, incl_vtype, default_vtype))

        if len(stack) != 1 or not isinstance(stack[0], cls):
            raise ValueError('Invalid PTB encoding.')
        return stack[0]

    def to_string(self, to_string=True, compressed=True, vtype=True):
        """Serialize the tree rooted at this node into bracketed form.

        Args:
            to_string: if false, return the list of pieces instead of a
                string; this also forces ``compressed`` to false.
            compressed: if true, parentheses are glued to their neighbours and
                the pieces are concatenated (``"(NP (DT the))"``). Otherwise
                every parenthesis, vtype and token is its own piece and the
                pieces are joined with single spaces.
            vtype: whether to emit each node's vtype.

        Returns:
            A string, or a list of pieces when ``to_string`` is false.
        """
        if not to_string:
            compressed = False

        def dfs(node):
            if compressed:
                if node.is_leaf:
                    if vtype:
                        yield '({} {})'.format(node.vtype, node.token)
                    else:
                        yield '({})'.format(node.token)
                else:
                    if vtype:
                        yield '({} '.format(node.vtype)
                    else:
                        yield '('
                    for i, child in enumerate(node.children):
                        if i != 0:
                            yield ' '
                        yield from dfs(child)
                    yield ')'
            else:
                yield '('
                if node.is_leaf:
                    if vtype:
                        yield node.vtype
                    yield node.token
                else:
                    if vtype:
                        yield node.vtype
                    for child in node.children:
                        yield from dfs(child)
                yield ')'

        pieces = list(dfs(self))
        if not to_string:
            return pieces
        if compressed:
            return ''.join(pieces)
        return ' '.join(pieces)

    def to_sentence(self, to_string=True):
        """Collect the tokens of all leaves, visited in pre-order.

        Args:
            to_string: if true, join the tokens with single spaces; otherwise
                return them as a list.
        """
        tokens = [node.token for node in traversal(self, 'pre') if node.is_leaf]
        if not to_string:
            return tokens
        return ' '.join(tokens)

    def __str_node__(self):
        """Return a one-line description: the vtype, plus the token for leaves."""
        if self.is_leaf:
            return 'VType: {} Token: {}'.format(self.vtype, self.token)
        return 'VType: {}'.format(self.vtype)

    def assign_index(self, start_index=0):
        """Number the leaves of this subtree consecutively, left to right.

        Args:
            start_index: the index given to the first leaf.

        Returns:
            The next unused index, i.e. ``start_index`` plus the number of
            leaves in this subtree.

        Raises:
            ValueError: if this node is not a leaf but has a token.
        """
        if not self.is_leaf and self.token is not None:
            raise ValueError('Cannot assign index for trees with non-leaf tokens.')

        if self.is_leaf:
            self.index = start_index
            return self.index + 1

        for child in self.children:
            start_index = child.assign_index(start_index)
        return start_index
def parse_ptb(sentence):
    """Parse a PTB bracketed encoding into a :class:`PTBNode` tree.

    This is a shorthand for :meth:`PTBNode.from_string` called with its
    default options.

    Args:
        sentence: the encoding, passed through unchanged.

    Returns:
        Whatever :meth:`PTBNode.from_string` returns for ``sentence``.
    """
    tree = PTBNode.from_string(sentence)
    return tree