# Source code for ebu_tt_live.node.deduplicator

from .base import AbstractCombinedNode
from ebu_tt_live.documents import EBUTT3DocumentSequence, EBUTT3Document
from ebu_tt_live.bindings.pyxb_utils import RecursiveOperation, StopBranchIteration
from ebu_tt_live.strings import DOC_RECEIVED
from ebu_tt_live.errors import SequenceNumberCollisionError, UnexpectedSequenceIdentifierError
from pyxb.binding.basis import ElementContent, complexTypeDefinition
from pyxb import BIND
from pyxb.namespace import ExpandedName
from ebu_tt_live import bindings
import logging


# Logger for this module, named after the module's import path.
log = logging.getLogger(__name__)
# Logger looked up under the fixed name 'document_logger'.
document_logger = logging.getLogger('document_logger')

class DeDuplicatorNode(AbstractCombinedNode):
    """
    The DeDuplicator Node addresses the issue raised, whereby after
    ReSequencing duplication of style and region elements and attributes
    occurs.

    Style and region elements in the document head that carry the same
    attributes (ignoring ``xml:id``) are collapsed into a single element and
    every reference to a removed element is redirected to the surviving one.
    """

    _sequence_identifier = None
    _expects = EBUTT3Document
    _provides = EBUTT3Document

    def __init__(self, node_id, sequence_identifier, consumer_carriage=None,
                 producer_carriage=None):
        """
        :param node_id: Identifier forwarded to the base node.
        :param sequence_identifier: Sequence identifier stamped on every
            document this node emits.
        :param consumer_carriage: Forwarded to the base node.
        :param producer_carriage: Forwarded to the base node; used to emit
            the processed documents.
        """
        super(DeDuplicatorNode, self).__init__(
            node_id=node_id,
            consumer_carriage=consumer_carriage,
            producer_carriage=producer_carriage
        )
        self._sequence_identifier = sequence_identifier

    def process_document(self, document, **kwargs):
        """
        De-duplicate an incoming document and emit it under this node's
        sequence identifier.

        Input for which ``is_document`` is false is ignored. A document is
        only processed and emitted when ``check_if_document_seen`` returns a
        true value for it.

        :param document: The incoming document.
        :param kwargs: Passed through unchanged to the producer carriage.
        :raises UnexpectedSequenceIdentifierError: If the incoming document
            already carries this node's output sequence identifier.
        """
        if not self.is_document(document):
            return

        if document.sequence_identifier == self._sequence_identifier:
            raise UnexpectedSequenceIdentifierError()

        if self.check_if_document_seen(document=document):
            self.limit_sequence_to_one(document)
            document.sequence_identifier = self._sequence_identifier
            self.remove_duplication(document=document)
            document.validate()
            self.producer_carriage.emit_data(data=document, **kwargs)

    def remove_duplication(self, document):
        """
        Collapse duplicated style and region elements of ``document`` in
        place and rewrite all style/region references accordingly.

        :param document: Document whose ``binding`` is modified in place.
        """
        # xml:id of every original element -> attribute hash of that element
        old_id_dict = {}
        # attribute hash -> xml:id of the element kept for that hash
        new_id_dict = {}
        # attribute hash -> element kept for that hash
        hash_dict = {}

        head = document.binding.head

        if head.styling is not None:
            styles = head.styling.style
            # Diagnostic output goes through the module logger rather than
            # being printed to stdout.
            log.debug('Styles before de-duplication: %s', styles)
            # Empty the container; the unique elements are appended below.
            head.styling.style = None

            self.CollateUniqueVals(styles, old_id_dict, new_id_dict, hash_dict)
            self.AppendNewElements(styles, head.styling.style,
                                   old_id_dict, new_id_dict, hash_dict)

        if head.layout is not None:
            regions = head.layout.region
            head.layout.region = None

            self.CollateUniqueVals(regions, old_id_dict, new_id_dict, hash_dict)
            self.AppendNewElements(regions, head.layout.region,
                                   old_id_dict, new_id_dict, hash_dict)

        replace_id_refs = ReplaceStylesAndRegions(document.binding,
                                                  old_id_dict,
                                                  new_id_dict)
        replace_id_refs.proceed()

    def CollateUniqueVals(self, element_list, old_id_dict, new_id_dict,
                          hash_dict):
        """
        Creates a `dict()` of all unique style/region names

        :param element_list: Style or region elements to inspect; their
            ``style`` references may be rewritten in place.
        :param old_id_dict: Updated with ``xml:id -> attribute hash`` for
            every element in ``element_list``.
        :param new_id_dict: Read only; ``attribute hash -> surviving xml:id``
            for elements that have already been appended.
        :param hash_dict: Updated with ``attribute hash -> element``. When
            several elements share a hash the last one in the list is kept.
        """
        for value in element_list:
            # Check for a missing entry before touching its attributes.
            if value is None:
                continue

            # deduplicating in-line style attributes
            if value.style is not None:
                for index, old_ref in enumerate(value.style):
                    new_ref = new_id_dict.get(old_id_dict.get(old_ref))
                    # new_id_dict is only filled by AppendNewElements, so a
                    # reference may not be resolvable yet (always the case
                    # while the style elements themselves are collated).
                    # Leave such a reference untouched instead of
                    # overwriting it with None; ReplaceStylesAndRegions
                    # remaps it once both dictionaries are complete.
                    if new_ref is not None:
                        value.style[index] = new_ref

            # deduplicating elements
            unique_val = ComparableElement(value)
            # stores references of original <xml:id> to <my_hash>
            old_id_dict[value.id] = unique_val.my_hash
            # stores references of <my_hash> to element
            hash_dict[unique_val.my_hash] = value

    def AppendNewElements(self, element_list, element_to_append_to, old_id_dict,
                          new_id_dict, hash_dict):
        """
        Replaces starting style and region elements with the unique ones
        identified in CollateUniqueVals

        :param element_list: The original elements of the kind currently
            being processed (styles or regions).
        :param element_to_append_to: Container receiving the unique elements.
        :param old_id_dict: Unused here; kept for interface compatibility.
        :param new_id_dict: Updated with ``attribute hash -> xml:id`` for
            every element appended.
        :param hash_dict: ``attribute hash -> element``; may also hold
            entries from an earlier call for a different element kind.
        """
        # hash_dict is shared between the styles pass and the regions pass,
        # so only entries whose id belongs to the current element_list are
        # appended. Ids are compared by value, not by object identity.
        current_ids = set(element.id for element in element_list)
        for hash_val, unique_element in hash_dict.items():
            if unique_element.id in current_ids:
                element_to_append_to.append(unique_element)
                new_id_dict[hash_val] = unique_element.id
def ReplaceNone(attr_value):
    """
    Substitute a placeholder for a missing attribute value.

    ``None`` is mapped to ``"|"``, a character that is not legal in the
    attribute values being compared, so that an absent attribute cannot
    'collide' with a similar attribute that does have a value. Any other
    value is handed back unchanged.
    """
    return "|" if attr_value is None else attr_value
class ComparableElement(object):
    """
    Takes all the attributes of an element and returns a hash value

    The hash is stored in ``my_hash`` and is also what ``__hash__`` returns.
    Two instances compare equal when their ``my_hash`` values are equal; the
    ``xml:id`` attribute never contributes to the hash.

    NOTE(review): equality is decided on the integer hash alone, so two
    different attribute sets whose strings happen to hash identically would
    be treated as duplicates.
    """

    def __init__(self, value):
        """
        :param value: Element binding whose attributes are hashed. It must
            provide ``_AttributeMap`` (containing an ``xml:id`` entry) and
            ``wildcardAttributeMap()``.
        :raises KeyError: If ``_AttributeMap`` has no ``xml:id`` entry.
        """
        self.value = value

        attribute_map = value._AttributeMap.copy()
        xml_id_attr = ExpandedName('http://www.w3.org/XML/1998/namespace', 'id')
        # This shouldn't throw an error, but if it does, something's wrong
        attribute_map.pop(xml_id_attr)

        # Every piece is rendered with a unicode format string instead of
        # str() so that non-ASCII attribute values do not raise
        # UnicodeEncodeError on Python 2.
        parts = []

        # sorted to make sure that for two elements with the same set of
        # attributes the values are put into the hash string in the same order
        for _name, attribute_use in sorted(attribute_map.items(),
                                           key=lambda item: item[0]):
            parts.append(u'%s' % (ReplaceNone(attribute_use.value(value)),))

        # Wildcard attributes are sorted as well, for the same reason. The
        # key is built from strings so that a missing namespace cannot make
        # the comparison fail.
        wildcard_items = sorted(
            value.wildcardAttributeMap().items(),
            key=lambda item: (ReplaceNone(item[0].namespaceURI()),
                              item[0].localName())
        )
        for name, wildcard_value in wildcard_items:
            parts.append(u'%s' % (ReplaceNone(name.namespaceURI()),))
            parts.append(u'%s' % (name.localName(),))
            parts.append(u'%s' % (ReplaceNone(wildcard_value),))

        # Every piece is followed by the separator, so neighbouring pieces
        # cannot run into each other.
        self.my_hash = hash(u''.join(part + u'%' for part in parts))

    def __eq__(self, other):
        # Always answer with a real bool, also for None and foreign objects.
        if not isinstance(other, ComparableElement):
            return False
        return self.my_hash == other.my_hash

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return self.my_hash
class ReplaceStylesAndRegions(RecursiveOperation):
    """
    Recursive operation that redirects ``style`` and ``region`` references
    to the identifiers of the elements kept after de-duplication.

    A reference is resolved in two steps: ``old_id_dict`` maps the referenced
    id to an attribute hash and ``new_id_dict`` maps that hash to the id of
    the surviving element.
    """

    old_id_dict = None
    new_id_dict = None

    def __init__(self, root_element, old_id_dict, new_id_dict):
        """
        :param root_element: Element at which the traversal starts.
        :param old_id_dict: Original id -> attribute hash.
        :param new_id_dict: Attribute hash -> surviving id.
        """
        super(ReplaceStylesAndRegions, self).__init__(
            root_element
        )
        self.old_id_dict = old_id_dict
        self.new_id_dict = new_id_dict

    def _is_begin_timed(self, value):
        pass

    def _before_element(self, value, element=None, parent_binding=None, **kwargs):
        pass

    def _after_element(self, value, element=None, parent_binding=None, **kwargs):
        pass

    def _resolve(self, old_id_ref):
        # Look up the surviving id for a reference; None when it is unknown.
        return self.new_id_dict.get(self.old_id_dict.get(old_id_ref))

    def _remap_style_refs(self, style_refs):
        # Rewrite a list of style references in place and drop repeats.
        #
        # Each entry is first replaced by its resolved id. Positions that
        # repeat an id already seen earlier in the list are then deleted
        # from the back, so the indexes still to be deleted stay valid and
        # the first occurrence of every id keeps its relative position.
        seen = set()
        duplicate_indexes = []
        for index in range(len(style_refs)):
            new_id_ref = self._resolve(style_refs[index])
            style_refs[index] = new_id_ref
            if new_id_ref in seen:
                duplicate_indexes.append(index)
            else:
                seen.add(new_id_ref)
        for index in reversed(duplicate_indexes):
            del style_refs[index]

    def _process_element(self, value, element=None, parent_binding=None, **kwargs):
        """
        Replaces the style and region attributes in the rest of the document
        with the identifiers of the de-duplicated elements.

        A reference that cannot be resolved through the two dictionaries is
        replaced by ``None``.
        """
        # The latter part of this and the next test is to check that the instance
        # is not a styling or layout element as these can't have style attributes
        # but their style elements present themselves in exactly the same way as
        # style attributes on other elements, so we have to avoid getting confused by them
        if hasattr(value, 'style') and value.style is not None and not \
                isinstance(value, bindings.styling):
            self._remap_style_refs(value.style)

        if hasattr(value, 'region') and value.region is not None and not \
                isinstance(value, bindings.layout):
            value.region = self._resolve(value.region)

    def _process_non_element(self, value, non_element, parent_binding=None, **kwargs):
        pass