TransformerFeatureExtractor

Extracts features from input texts using transformer embeddings.

Source code in aimet_ml/features/textual/transformers.py
from typing import List, Union

import torch
from transformers import AutoModel, AutoTokenizer


class TransformerFeatureExtractor:
    """Extracts features from input texts using transformer embeddings."""

    def __init__(
        self,
        model_name: str,
        num_emb_layers: int = 4,
        max_length: int = 512,
        device: Union[str, torch.device] = "cuda:0",
    ):
        """
        Initializes the TransformerFeatureExtractor.

        Args:
            model_name (str): The name or path of the pre-trained transformer model.
            num_emb_layers (int, optional): Number of layers to use for feature extraction. Default is 4.
            max_length (int, optional): Maximum length of input text for tokenization. Default is 512.
            device (str or torch.device, optional): Device to use for computation ('cuda:0', 'cpu', etc.).
                Default is 'cuda:0' if available, else 'cpu'.
        """

        # Fall back to CPU when CUDA is unavailable, regardless of the requested device.
        if not torch.cuda.is_available():
            device = "cpu"

        self.model = AutoModel.from_pretrained(model_name)
        self.model.to(device)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.num_emb_layers = num_emb_layers
        self.max_length = max_length

    def extract_features(self, texts: Union[str, List[str]]) -> torch.Tensor:
        """
        Extracts features from input texts using transformer embeddings.

        Args:
            texts (str or list): Input text or list of texts for feature extraction.

        Returns:
            torch.Tensor: Extracted features for input texts.
        """
        self.model.eval()

        if isinstance(texts, str):
            texts = [texts]

        input_ids, attention_masks = [], []
        for text in texts:
            tokenized_output = self.tokenize(text)
            input_ids.append(tokenized_output["input_ids"])
            attention_masks.append(tokenized_output["attention_mask"])
        input_ids_tensor = torch.cat(input_ids, dim=0).to(self.model.device)
        attention_masks_tensor = torch.cat(attention_masks, dim=0).to(self.model.device)

        with torch.no_grad():
            hidden_states = self.model(
                input_ids_tensor, attention_mask=attention_masks_tensor, output_hidden_states=True
            )["hidden_states"]

        # Sum the hidden state of the first token ([CLS]) over the last num_emb_layers layers.
        embeddings = sum(hidden_states[-i][:, 0, :] for i in range(1, self.num_emb_layers + 1))
        embeddings = embeddings.detach().cpu()

        return embeddings

    def tokenize(self, text: str) -> dict:
        """
        Tokenizes input text using the transformer's tokenizer.

        Args:
            text (str): Input text to be tokenized.

        Returns:
            dict: Dictionary containing tokenized input with attention mask.
        """
        tokenized_output = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return tokenized_output.data
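
A minimal usage sketch, assuming the imports above; "bert-base-uncased" is only an example checkpoint, not a project default:

extractor = TransformerFeatureExtractor("bert-base-uncased")
features = extractor.extract_features(["first text", "second text"])
print(features.shape)  # torch.Size([2, 768]); 768 is the hidden size of bert-base-uncased
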
__init__(model_name, num_emb_layers=4, max_length=512, device='cuda:0')

Initializes the TransformerFeatureExtractor.

Parameters:

    model_name (str, required): The name or path of the pre-trained transformer model.
    num_emb_layers (int, default 4): Number of layers to use for feature extraction.
    max_length (int, default 512): Maximum length of input text for tokenization.
    device (str or torch.device, default 'cuda:0'): Device to use for computation
        ('cuda:0', 'cpu', etc.). Falls back to 'cpu' when CUDA is unavailable.
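
Note the silent device fallback: whatever device is requested, the model is placed on the CPU when torch.cuda.is_available() is False. A minimal sketch, again using an example checkpoint:

extractor = TransformerFeatureExtractor("bert-base-uncased", device="cuda:0")
print(extractor.model.device)  # device(type='cpu') on a machine without CUDA
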
extract_features(texts)

Extracts features from input texts using transformer embeddings.

Parameters:

    texts (str or list, required): Input text or list of texts for feature extraction.

Returns:

    torch.Tensor: Extracted features for the input texts.
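
Each returned row is the sum of the first-token ([CLS]) hidden states from the last num_emb_layers encoder layers, so the output has shape (number of texts, hidden size). A minimal sketch, assuming the extractor constructed above:

feats_one = extractor.extract_features("a single string is also accepted")
feats_many = extractor.extract_features(["first text", "second text"])
print(feats_one.shape)   # torch.Size([1, 768]) for bert-base-uncased
print(feats_many.shape)  # torch.Size([2, 768])
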
tokenize(text)

Tokenizes input text using the transformer's tokenizer.

Parameters:

    text (str, required): Input text to be tokenized.

Returns:

    dict: Dictionary containing the tokenized input with attention mask.
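
A minimal sketch of the tokenizer output, assuming the extractor constructed above. Every text is padded or truncated to max_length, and the exact keys depend on the checkpoint's tokenizer (BERT-style tokenizers also return token_type_ids):

encoded = extractor.tokenize("hello world")
print(encoded["input_ids"].shape)       # torch.Size([1, 512]) with the default max_length
print(encoded["attention_mask"].shape)  # torch.Size([1, 512]); 1 marks real tokens, 0 padding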