Source code for subliminal.subtitle

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import logging
import os

import chardet
from guessit.matchtree import MatchTree
from guessit.plugins.transformers import get_transformer
import pysrt

from .video import Episode, Movie

logger = logging.getLogger(__name__)


[docs]class Subtitle(object): """Base class for subtitle. :param language: language of the subtitle. :type language: :class:`~babelfish.language.Language` :param bool hearing_impaired: whether or not the subtitle is hearing impaired. :param page_link: URL of the web page from which the subtitle can be downloaded. :type page_link: str """ #: Name of the provider that returns that class of subtitle provider_name = '' def __init__(self, language, hearing_impaired=False, page_link=None): #: Language of the subtitle self.language = language #: Whether or not the subtitle is hearing impaired self.hearing_impaired = hearing_impaired #: URL of the web page from which the subtitle can be downloaded self.page_link = page_link #: Content as bytes self.content = None #: Encoding to decode with when accessing :attr:`text` self.encoding = None @property def id(self): """Unique identifier of the subtitle.""" raise NotImplementedError @property def text(self): """Content as string. If :attr:`encoding` is None, the encoding is guessed with :meth:`guess_encoding` """ if not self.content: return return self.content.decode(self.encoding or self.guess_encoding(), errors='replace')
[docs] def is_valid(self): """Check if a :attr:`text` is a valid SubRip format. :return: whether or not the subtitle is valid. :rtype: bool """ if not self.text: return False try: pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE) except pysrt.Error as e: if e.args[0] < 80: return False return True
[docs] def guess_encoding(self): """Guess encoding using the language, falling back on chardet. :return: the guessed encoding. :rtype: str """ logger.info('Guessing encoding for language %s', self.language) # always try utf-8 first encodings = ['utf-8'] # add language-specific encodings if self.language.alpha3 == 'zho': encodings.extend(['gb18030', 'big5']) elif self.language.alpha3 == 'jpn': encodings.append('shift-jis') elif self.language.alpha3 == 'ara': encodings.append('windows-1256') elif self.language.alpha3 == 'heb': encodings.append('windows-1255') elif self.language.alpha3 == 'tur': encodings.extend(['iso-8859-9', 'windows-1254']) elif self.language.alpha3 == 'pol': # Eastern European Group 1 encodings.extend(['windows-1250']) elif self.language.alpha3 == 'bul': # Eastern European Group 2 encodings.extend(['windows-1251']) else: # Western European (windows-1252) encodings.append('latin-1') # try to decode logger.debug('Trying encodings %r', encodings) for encoding in encodings: try: self.content.decode(encoding) except UnicodeDecodeError: pass else: logger.info('Guessed encoding %s', encoding) return encoding logger.warning('Could not guess encoding from language') # fallback on chardet encoding = chardet.detect(self.content)['encoding'] logger.info('Chardet found encoding %s', encoding) return encoding
[docs] def get_matches(self, video, hearing_impaired=False): """Get the matches against the `video`. :param video: the video to get the matches with. :type video: :class:`~subliminal.video.Video` :param bool hearing_impaired: hearing impaired preference. :return: matches of the subtitle. :rtype: set """ matches = set() # hearing_impaired if self.hearing_impaired == hearing_impaired: matches.add('hearing_impaired') return matches
def __hash__(self): return hash(self.provider_name + '-' + self.id) def __repr__(self): return '<%s %r [%s]>' % (self.__class__.__name__, self.id, self.language)
[docs]def compute_score(matches, video, scores=None): """Compute the score of the `matches` against the `video`. Some matches count as much as a combination of others in order to level the final score: * `hash` removes everything else * For :class:`~subliminal.video.Episode` * `imdb_id` removes `series`, `tvdb_id`, `season`, `episode`, `title` and `year` * `tvdb_id` removes `series` and `year` * `title` removes `season` and `episode` :param video: the video to get the score with. :type video: :class:`~subliminal.video.Video` :param dict scores: scores to use, if `None`, the :attr:`~subliminal.video.Video.scores` from the video are used. :return: score of the subtitle. :rtype: int """ final_matches = matches.copy() scores = scores or video.scores logger.info('Computing score for matches %r and %r', matches, video) # remove equivalent match combinations if 'hash' in final_matches: final_matches &= {'hash', 'hearing_impaired'} elif isinstance(video, Episode): if 'imdb_id' in final_matches: final_matches -= {'series', 'tvdb_id', 'season', 'episode', 'title', 'year'} if 'tvdb_id' in final_matches: final_matches -= {'series', 'year'} if 'title' in final_matches: final_matches -= {'season', 'episode'} # compute score logger.debug('Final matches: %r', final_matches) score = sum((scores[match] for match in final_matches)) logger.info('Computed score %d', score) # ensure score is capped by the best possible score (hash + preferences) assert score <= scores['hash'] + scores['hearing_impaired'] return score
[docs]def get_subtitle_path(video_path, language=None, extension='.srt'): """Get the subtitle path using the `video_path` and `language`. :param str video_path: path to the video. :param language: language of the subtitle to put in the path. :type language: :class:`~babelfish.language.Language` :param str extension: extension of the subtitle. :return: path of the subtitle. :rtype: str """ subtitle_root = os.path.splitext(video_path)[0] if language: subtitle_root += '.' + str(language) return subtitle_root + extension
[docs]def guess_matches(video, guess, partial=False): """Get matches between a `video` and a `guess`. If a guess is `partial`, the absence information won't be counted as a match. :param video: the video. :type video: :class:`~subliminal.video.Video` :param guess: the guess. :type guess: dict :param bool partial: whether or not the guess is partial. :return: matches between the `video` and the `guess`. :rtype: set """ matches = set() if isinstance(video, Episode): # series if video.series and 'series' in guess and guess['series'].lower() == video.series.lower(): matches.add('series') # season if video.season and 'season' in guess and guess['season'] == video.season: matches.add('season') # episode if video.episode and 'episodeNumber' in guess and guess['episodeNumber'] == video.episode: matches.add('episode') # year if video.year and 'year' in guess and guess['year'] == video.year: matches.add('year') # count "no year" as an information if not partial and video.year is None and 'year' not in guess: matches.add('year') elif isinstance(video, Movie): # year if video.year and 'year' in guess and guess['year'] == video.year: matches.add('year') # title if video.title and 'title' in guess and guess['title'].lower() == video.title.lower(): matches.add('title') # release_group if video.release_group and 'releaseGroup' in guess and guess['releaseGroup'].lower() == video.release_group.lower(): matches.add('release_group') # resolution if video.resolution and 'screenSize' in guess and guess['screenSize'] == video.resolution: matches.add('resolution') # format if video.format and 'format' in guess and guess['format'].lower() == video.format.lower(): matches.add('format') # video_codec if video.video_codec and 'videoCodec' in guess and guess['videoCodec'] == video.video_codec: matches.add('video_codec') # audio_codec if video.audio_codec and 'audioCodec' in guess and guess['audioCodec'] == video.audio_codec: matches.add('audio_codec') return matches
[docs]def guess_properties(string): """Extract properties from `string` using guessit's `guess_properties` transformer. :param str string: the string potentially containing properties. :return: the guessed properties. :rtype: dict """ mtree = MatchTree(string) get_transformer('guess_properties').process(mtree) return mtree.matched()
[docs]def fix_line_ending(content): """Fix line ending of `content` by changing it to \n. :param bytes content: content of the subtitle. :return: the content with fixed line endings. :rtype: bytes """ return content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')