Module ktrain.text.translation.core

Expand source code
from ... import utils as U
from ...imports import *
from ...torch_base import TorchBase
from .. import textutils as TU

SUPPORTED_SRC_LANGS = ["zh", "ar", "ru", "de", "af", "es", "fr", "it", "pt"]


class Translator(TorchBase):
    """
    Translator: basic wrapper around MarianMT model for language translation
    """

    def __init__(self, model_name=None, device=None, quantize=False):
        """
        ```
        Basic wrapper around a MarianMT model for language translation.

        Args:
          model_name(str): name of a Helsinki-NLP MarianMT model
                           (e.g., 'Helsinki-NLP/opus-mt-de-en')
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        Raises:
          ValueError: if model_name is not supplied
        ```
        """
        # Fail fast with a clear message instead of the opaque TypeError the
        # substring check below would raise on None.
        if model_name is None:
            raise ValueError(
                "model_name is required (e.g., 'Helsinki-NLP/opus-mt-de-en')"
            )
        if "Helsinki-NLP" not in model_name:
            warnings.warn(
                "Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP"
            )
        super().__init__(device=device, quantize=quantize)
        # deferred import: transformers is heavy and only needed here
        from transformers import MarianMTModel, MarianTokenizer

        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(self.torch_device)
        if quantize:
            self.model = self.quantize_model(self.model)

    def translate(self, src_text, join_with="\n", num_beams=1, early_stopping=False):
        """
        ```
        Translate document (src_text).
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          src_text(str): source text.
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to 1, which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to False.
        Returns:
          str: translated text
        ```
        """
        # Split the document into sentences, translate them as one batch,
        # then re-join with the requested delimiter.
        sentences = TU.sent_tokenize(src_text)
        tgt_sentences = self.translate_sentences(
            sentences, num_beams=num_beams, early_stopping=early_stopping
        )
        return join_with.join(tgt_sentences)

    def translate_sentences(self, sentences, num_beams=1, early_stopping=False):
        """
        ```
        Translate sentences using model_name as model.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          sentences(list): list of strings representing sentences that need to be translated
                         IMPORTANT NOTE: Sentences are fed to the model as a single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          num_beams(int): Number of beams for beam search. Defaults to 1, which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to False.
        Returns:
          list: translated sentences
        ```
        """
        import torch

        # Tokenize the whole list as one padded batch.
        # NOTE: tokenizer.prepare_seq2seq_batch is deprecated/removed in
        # newer versions of transformers; calling the tokenizer directly
        # with padding/truncation is the supported equivalent.
        batch = self.tokenizer(
            sentences, return_tensors="pt", padding=True, truncation=True
        ).to(self.torch_device)
        with torch.no_grad():
            translated = self.model.generate(
                **batch,
                num_beams=num_beams,
                early_stopping=early_stopping
            )
            tgt_sentences = [
                self.tokenizer.decode(t, skip_special_tokens=True) for t in translated
            ]
        return tgt_sentences


class EnglishTranslator:
    """
    Class to translate text in various languages to English.
    """

    # Maps each supported language code to the sequence of Helsinki-NLP
    # models applied (in order) to translate it to English.  A value may
    # list more than one model to support multi-step (pivot) translation.
    # NOTE: keep keys in sync with module-level SUPPORTED_SRC_LANGS.
    _LANG2MODELS = {
        "zh": ["Helsinki-NLP/opus-mt-zh-en"],
        "ar": ["Helsinki-NLP/opus-mt-ar-en"],
        "ru": ["Helsinki-NLP/opus-mt-ru-en"],
        "de": ["Helsinki-NLP/opus-mt-de-en"],
        "af": ["Helsinki-NLP/opus-mt-af-en"],
        "es": ["Helsinki-NLP/opus-mt-ROMANCE-en"],
        "fr": ["Helsinki-NLP/opus-mt-ROMANCE-en"],
        "it": ["Helsinki-NLP/opus-mt-ROMANCE-en"],
        "pt": ["Helsinki-NLP/opus-mt-ROMANCE-en"],
    }

    def __init__(self, src_lang=None, device=None, quantize=False):
        """
        ```
        Constructor for English translator

        Args:
          src_lang(str): language code of source language.
                         Must be one of SUPPORTED_SRC_LANGS:
                           'zh': Chinese (either traditional or simplified)
                           'ar': Arabic
                           'ru': Russian
                           'de': German
                           'af': Afrikaans
                           'es': Spanish
                           'fr': French
                           'it': Italian
                           'pt': Portuguese
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        Raises:
          ValueError: if src_lang is missing or unsupported
        ```
        """
        if src_lang is None or src_lang not in self._LANG2MODELS:
            raise ValueError(
                "A src_lang must be supplied and be one of: %s"
                % (list(self._LANG2MODELS))
            )
        self.src_lang = src_lang
        # Build the translator chain for this language; translate() applies
        # each translator in sequence.
        self.translators = [
            Translator(model_name=model_name, device=device, quantize=quantize)
            for model_name in self._LANG2MODELS[src_lang]
        ]

    def translate(self, src_text, join_with="\n", num_beams=1, early_stopping=False):
        """
        ```
        Translate source document to English.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

        Args:
          src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to 1, which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to False.
        Returns:
          str: translated text
        ```
        """
        # Feed the output of each translator into the next (supports
        # multi-step pivot translation chains).
        text = src_text
        for t in self.translators:
            text = t.translate(
                text,
                join_with=join_with,
                num_beams=num_beams,
                early_stopping=early_stopping,
            )
        return text

Classes

class EnglishTranslator (src_lang=None, device=None, quantize=False)

Class to translate text in various languages to English.

Constructor for English translator

Args:
  src_lang(str): language code of source language.
                 Must be one of SUPPORTED_SRC_LANGS:
                   'zh': Chinese (either traditional or simplified)
                   'ar': Arabic
                   'ru' : Russian
                   'de': German
                   'af': Afrikaans
                   'es': Spanish
                   'fr': French
                   'it': Italian
                   'pt': Portuguese
  device(str): device to use (e.g., 'cuda', 'cpu')
  quantize(bool): If True, use quantization.
Expand source code
class EnglishTranslator:
    """
    Class to translate text in various languages to English.
    """

    def __init__(self, src_lang=None, device=None, quantize=False):
        """
        ```
        Constructor for English translator

        Args:
          src_lang(str): language code of source language.
                         Must be one of SUPPORTED_SRC_LANGS:
                           'zh': Chinese (either tradtional or simplified)
                           'ar': Arabic
                           'ru' : Russian
                           'de': German
                           'af': Afrikaans
                           'es': Spanish
                           'fr': French
                           'it': Italian
                           'pt': Portuguese
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        ```
        """

        if src_lang is None or src_lang not in SUPPORTED_SRC_LANGS:
            raise ValueError(
                "A src_lang must be supplied and be one of: %s" % (SUPPORTED_SRC_LANGS)
            )
        self.src_lang = src_lang
        self.translators = []
        if src_lang == "ar":
            self.translators.append(
                Translator(
                    model_name="Helsinki-NLP/opus-mt-ar-en",
                    device=device,
                    quantize=quantize,
                )
            )
        elif src_lang == "ru":
            self.translators.append(
                Translator(
                    model_name="Helsinki-NLP/opus-mt-ru-en",
                    device=device,
                    quantize=quantize,
                )
            )
        elif src_lang == "de":
            self.translators.append(
                Translator(
                    model_name="Helsinki-NLP/opus-mt-de-en",
                    device=device,
                    quantize=quantize,
                )
            )
        elif src_lang == "af":
            self.translators.append(
                Translator(
                    model_name="Helsinki-NLP/opus-mt-af-en",
                    device=device,
                    quantize=quantize,
                )
            )
        elif src_lang in ["es", "fr", "it", "pt"]:
            self.translators.append(
                Translator(
                    model_name="Helsinki-NLP/opus-mt-ROMANCE-en",
                    device=device,
                    quantize=quantize,
                )
            )
        # elif src_lang == 'zh': # could not find zh->en model, so currently doing two-step translation to English via German
        # self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ZH-de', device=device))
        # self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-de-en', device=device))
        elif src_lang == "zh":
            self.translators.append(
                Translator(
                    model_name="Helsinki-NLP/opus-mt-zh-en",
                    device=device,
                    quantize=quantize,
                )
            )
        else:
            raise ValueError("lang:%s is currently not supported." % (src_lang))

    def translate(self, src_text, join_with="\n", num_beams=1, early_stopping=False):
        """
        ```
        Translate source document to English.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

        Args:
          src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                          whicn means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated text
        ```
        """
        text = src_text
        for t in self.translators:
            text = t.translate(
                text,
                join_with=join_with,
                num_beams=num_beams,
                early_stopping=early_stopping,
            )
        return text

Methods

def translate(self, src_text, join_with='\n', num_beams=1, early_stopping=False)
Translate source document to English.
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

Args:
  src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                 The source text can either be a single sentence or an entire document with multiple sentences
                 and paragraphs.
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                  break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  join_with(str):  list of translated sentences will be delimited with this character.
                   default: each sentence on separate line
  num_beams(int): Number of beams for beam search. Defaults to 1,
                  which means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  str: translated text
Expand source code
def translate(self, src_text, join_with="\n", num_beams=1, early_stopping=False):
    """
    ```
    Translate source document to English.
    To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

    Args:
      src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                     The source text can either be a single sentence or an entire document with multiple sentences
                     and paragraphs.
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     If the input text is very large (e.g., an entire book), you should
                                     break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and
                                     feed each chunk separately into translate to avoid out-of-memory issues.
      join_with(str):  list of translated sentences will be delimited with this character.
                       default: each sentence on separate line
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                      whicn means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                             are finished per batch or not. Defaults to None.  If None, the transformers library
                             sets this to False.
    Returns:
      str: translated text
    ```
    """
    text = src_text
    for t in self.translators:
        text = t.translate(
            text,
            join_with=join_with,
            num_beams=num_beams,
            early_stopping=early_stopping,
        )
    return text
class Translator (model_name=None, device=None, quantize=False)

Translator: basic wrapper around MarianMT model for language translation

basic wrapper around MarianMT model for language translation

Args:
  model_name(str): Helsinki-NLP model
  device(str): device to use (e.g., 'cuda', 'cpu')
  quantize(bool): If True, use quantization.
Expand source code
class Translator(TorchBase):
    """
    Translator: basic wrapper around MarianMT model for language translation
    """

    def __init__(self, model_name=None, device=None, quantize=False):
        """
        ```
        basic wrapper around MarianMT model for language translation

        Args:
          model_name(str): Helsinki-NLP model
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        ```
        """
        if "Helsinki-NLP" not in model_name:
            warnings.warn(
                "Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP"
            )
        super().__init__(device=device, quantize=quantize)
        from transformers import MarianMTModel, MarianTokenizer

        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(self.torch_device)
        if quantize:
            self.model = self.quantize_model(self.model)

    def translate(self, src_text, join_with="\n", num_beams=1, early_stopping=False):
        """
        ```
        Translate document (src_text).
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          src_text(str): source text.
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                          whicn means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated text
        ```
        """
        sentences = TU.sent_tokenize(src_text)
        tgt_sentences = self.translate_sentences(
            sentences, num_beams=num_beams, early_stopping=early_stopping
        )
        return join_with.join(tgt_sentences)

    def translate_sentences(self, sentences, num_beams=1, early_stopping=False):
        """
        ```
        Translate sentences using model_name as model.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          sentences(list): list of strings representing sentences that need to be translated
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                          whicn means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated sentences
        ```
        """
        import torch

        with torch.no_grad():
            translated = self.model.generate(
                **self.tokenizer.prepare_seq2seq_batch(
                    sentences, return_tensors="pt"
                ).to(self.torch_device),
                num_beams=num_beams,
                early_stopping=early_stopping
            )
            tgt_sentences = [
                self.tokenizer.decode(t, skip_special_tokens=True) for t in translated
            ]
        return tgt_sentences

Ancestors

Methods

def translate(self, src_text, join_with='\n', num_beams=1, early_stopping=False)
Translate document (src_text).
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
Args:
  src_text(str): source text.
                 The source text can either be a single sentence or an entire document with multiple sentences
                 and paragraphs.
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                 break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  join_with(str):  list of translated sentences will be delimited with this character.
                   default: each sentence on separate line
  num_beams(int): Number of beams for beam search. Defaults to 1,
                  which means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  str: translated text
Expand source code
def translate(self, src_text, join_with="\n", num_beams=1, early_stopping=False):
    """
    ```
    Translate document (src_text).
    To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
    Args:
      src_text(str): source text.
                     The source text can either be a single sentence or an entire document with multiple sentences
                     and paragraphs.
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     If the input text is very large (e.g., an entire book), you should
                                     break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and
                                     feed each chunk separately into translate to avoid out-of-memory issues.
      join_with(str):  list of translated sentences will be delimited with this character.
                       default: each sentence on separate line
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                      whicn means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                             are finished per batch or not. Defaults to None.  If None, the transformers library
                             sets this to False.
    Returns:
      str: translated text
    ```
    """
    sentences = TU.sent_tokenize(src_text)
    tgt_sentences = self.translate_sentences(
        sentences, num_beams=num_beams, early_stopping=early_stopping
    )
    return join_with.join(tgt_sentences)
def translate_sentences(self, sentences, num_beams=1, early_stopping=False)
Translate sentences using model_name as model.
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
Args:
  sentences(list): list of strings representing sentences that need to be translated
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                 break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                  whicn means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  str: translated sentences
Expand source code
def translate_sentences(self, sentences, num_beams=1, early_stopping=False):
    """
    ```
    Translate sentences using model_name as model.
    To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
    Args:
      sentences(list): list of strings representing sentences that need to be translated
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     If the input text is very large (e.g., an entire book), you should
                                     break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and
                                     feed each chunk separately into translate to avoid out-of-memory issues.
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                      whicn means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                             are finished per batch or not. Defaults to None.  If None, the transformers library
                             sets this to False.
    Returns:
      str: translated sentences
    ```
    """
    import torch

    with torch.no_grad():
        translated = self.model.generate(
            **self.tokenizer.prepare_seq2seq_batch(
                sentences, return_tensors="pt"
            ).to(self.torch_device),
            num_beams=num_beams,
            early_stopping=early_stopping
        )
        tgt_sentences = [
            self.tokenizer.decode(t, skip_special_tokens=True) for t in translated
        ]
    return tgt_sentences

Inherited members