Source code for text_processing_util_mds24.text_processing_util_mds24

import string
import numpy as np
from collections import Counter



[docs]
def text_clean(docs: list[str]) -> list[list[str]]:
    """Clean raw documents and split each one into a list of words.

    For every document, characters that are numeric (``str.isnumeric``) or
    listed in ``string.punctuation`` are dropped, the remaining characters are
    lower-cased, and the result is split on whitespace.

    Parameters
    ----------
    docs : list[str]
        Documents to be processed.
        Each item in the list is a document.

    Returns
    -------
    list[list[str]]
        Cleaned documents, one list of words per input document, in the
        same order as ``docs``.

    Raises
    ------
    TypeError
        If ``docs`` is not a list, or if any item of ``docs`` is not a string.

    Examples
    --------
    >>> text_clean(["We are group 10.", "We are the best!"])
    [['we', 'are', 'group'], ['we', 'are', 'the', 'best']]
    """
    if not isinstance(docs, list):
        raise TypeError('Input is not a list')
    for doc_i, doc in enumerate(docs):
        if not isinstance(doc, str):
            raise TypeError(f'Document {doc_i} is not string')

    punctuation = set(string.punctuation)
    out_docs: list[list[str]] = []

    for doc in docs:
        # Removed characters are deleted, not replaced by a space, so
        # "well-known" becomes the single word "wellknown".
        # Lower-casing is done per character, after the filter.
        kept = "".join(
            ch.lower()
            for ch in doc
            if not ch.isnumeric() and ch not in punctuation
        )
        # split() with no argument separates on any whitespace run (spaces,
        # tabs, newlines) and never yields empty strings.
        out_docs.append(kept.split())

    return out_docs




[docs]
def frequency_vectorizer(docs: list[str]) -> tuple[np.ndarray, np.ndarray]:
    """
    Calculates the relative frequency of each word in a list of text documents.

    Documents are first cleaned with ``text_clean``. The frequency of a term in
    a document is its number of occurrences divided by the number of words in
    that cleaned document. A document with no words yields a row of zeros.

    Parameters
    ----------
    docs : list[str]
        A list of text documents.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Tuple containing two elements:
            - A 2D array with one row per document and one column per distinct
              term, holding the frequency of each term in each document.
            - The feature names corresponding to the columns of the frequency
              matrix, in sorted order. For non-empty input this is returned
              as a plain list of strings.
        If ``docs`` is empty, both elements are empty NumPy arrays.

    Raises
    ------
    TypeError
        Raised by ``text_clean`` if ``docs`` is not a list of strings.

    Examples
    --------
    >>> docs = ["This is a sample document.", "Another document for testing."]
    >>> tf_matrix, feature_names = frequency_vectorizer(docs)
    >>> print(tf_matrix)
    [[0.2  0.   0.2  0.   0.2  0.2  0.   0.2 ]
     [0.   0.25 0.25 0.25 0.   0.   0.25 0.  ]]
    >>> print(feature_names)
    ['a', 'another', 'document', 'for', 'is', 'sample', 'testing', 'this']
    """
    # text_clean also validates the input, so it runs before the empty check.
    cleaned_docs = text_clean(docs)

    if not docs:
        return np.array([]), np.array([])

    # Build the vocabulary once; the sorted order defines the column order.
    feature_names = sorted({term for doc in cleaned_docs for term in doc})
    column_of = {term: col for col, term in enumerate(feature_names)}
    tf_matrix = np.zeros((len(cleaned_docs), len(feature_names)))

    for row, doc in enumerate(cleaned_docs):
        if not doc:
            # No words: leave the row at zero and avoid dividing by zero.
            continue
        total_terms = len(doc)
        # Only terms present in the document need writing; all other cells
        # keep the 0.0 they were initialised with.
        for term, count in Counter(doc).items():
            tf_matrix[row, column_of[term]] = count / total_terms

    return tf_matrix, feature_names




[docs]
def tfidf_vectorizer(docs: list[str]) -> tuple[np.ndarray, np.ndarray]:
    """
    Calculates TF-IDF scores for a list of documents.

    Documents are first cleaned with ``text_clean``. For each term:

    - TF is the term's count in a document divided by the number of words in
      that cleaned document.
    - IDF is ``log(N / (df + 1))``, where ``N`` is the number of documents and
      ``df`` is the number of documents containing the term.

    The score is ``TF * IDF``. With this IDF, a term that occurs in all
    documents gets a negative score, and a term that occurs in all but one
    document gets a score of zero.

    Parameters
    ----------
    docs : list[str]
        A list of documents (strings).

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Tuple containing two elements:
            - A 2D array with one row per document and one column per distinct
              term, holding the TF-IDF score of each term in each document.
            - The feature names corresponding to the columns of the TF-IDF
              matrix, returned as a sorted list of strings.

    Raises
    ------
    TypeError
        Raised by ``text_clean`` if ``docs`` is not a list of strings.

    Examples
    --------
    >>> docs = ["Machine learning is interesting", "Python is widely used in machine learning"]
    >>> tfidf_matrix, feature_names = tfidf_vectorizer(docs)
    >>> print(feature_names)
    ['in', 'interesting', 'is', 'learning', 'machine', 'python', 'used', 'widely']
    >>> print(np.round(tfidf_matrix, 4).tolist())
    [[0.0, 0.0, -0.1014, -0.1014, -0.1014, 0.0, 0.0, 0.0], [0.0, 0.0, -0.0579, -0.0579, -0.0579, 0.0, 0.0, 0.0]]
    """
    cleaned_docs = text_clean(docs)
    n_docs = len(docs)

    # One pass per document: per-document term frequencies, plus the number
    # of documents each term appears in.
    term_freqs = []
    doc_freq = Counter()
    for words in cleaned_docs:
        n_words = len(words)
        term_freqs.append(
            {term: n / n_words for term, n in Counter(words).items()}
        )
        doc_freq.update(set(words))

    # Inverse document frequency with +1 in the denominator.
    idf = {
        term: np.log(n_docs / (seen + 1)) for term, seen in doc_freq.items()
    }

    # Sorted vocabulary fixes the column order of the matrix.
    feature_names = sorted(idf)
    tfidf_matrix = np.zeros((n_docs, len(feature_names)))

    for row, freqs in enumerate(term_freqs):
        for col, term in enumerate(feature_names):
            tfidf_matrix[row, col] = freqs.get(term, 0) * idf[term]

    return tfidf_matrix, feature_names




[docs]
def tokenizer_padding(docs: list[str]) -> np.ndarray:
    """
    Converts each text document into a sequence of numerical tokens and pads
    shorter sequences with zeros so that every row has the same length.

    Documents are first cleaned with ``text_clean``. Each distinct word gets
    an integer identifier starting at 1, assigned in order of first
    appearance across the documents. Zero is used only for padding, which is
    added at the end of shorter documents.

    Parameters
    ----------
    docs : list[str]
        A list of text documents.

    Returns
    -------
    np.ndarray
        2D array with one row per document and as many columns as the longest
        cleaned document has words. The array is created with ``np.zeros``
        and therefore holds the token ids as floating-point values.
        If ``docs`` is empty, an empty array is returned.

    Raises
    ------
    TypeError
        Raised by ``text_clean`` if ``docs`` is not a list of strings.

    Examples
    --------
    >>> tokenized_padded = tokenizer_padding(["the first sentence", "the second longer sentence"])
    >>> print(tokenized_padded.tolist())
    [[1.0, 2.0, 3.0, 0.0], [1.0, 4.0, 5.0, 3.0]]
    >>> tokenized_padded = tokenizer_padding(["a sample text", "sample text two"])
    >>> print(tokenized_padded.tolist())
    [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]
    """
    tokenized = text_clean(docs)

    if not docs:
        return np.array([])

    width = max(len(words) for words in tokenized)
    padded = np.zeros((len(tokenized), width))
    vocabulary = {}

    for row, words in enumerate(tokenized):
        for col, word in enumerate(words):
            # A new word receives the next unused id (1, 2, 3, ...);
            # a known word keeps the id it was given first.
            padded[row, col] = vocabulary.setdefault(word, len(vocabulary) + 1)

    return padded