# Source code for text_processing_util_mds24.text_processing_util_mds24

import string
import numpy as np
from collections import Counter


def text_clean(docs: list[str]) -> list[list[str]]:
    """Normalise raw documents into lists of lower-case words.

    For every document, ASCII punctuation (``string.punctuation``) and
    numeric characters (``str.isnumeric``) are dropped, the remaining
    characters are lower-cased, and the result is split into words on any
    run of whitespace (spaces, tabs, newlines, ...).

    Parameters
    ----------
    docs : list[str]
        Documents to be processed. Each item in the list is a document.

    Returns
    -------
    list[list[str]]
        Cleaned documents, one list of words per input document. A document
        with no remaining words yields an empty list.

    Raises
    ------
    TypeError
        If `docs` is not a list, or if any item of `docs` is not a string.

    Examples
    --------
    >>> text_clean(["We are group 10.", "We are the best!"])
    [['we', 'are', 'group'], ['we', 'are', 'the', 'best']]
    """
    if not isinstance(docs, list):
        raise TypeError('Input is not a list')
    for doc_i, doc in enumerate(docs):
        if not isinstance(doc, str):
            raise TypeError(f'Document {doc_i} is not string')

    punctuation = set(string.punctuation)
    out_docs: list[list[str]] = []
    for doc in docs:
        # Lower-case per character (not on the whole string) so the filter
        # and the case folding happen in a single pass.
        kept = "".join(
            ch.lower()
            for ch in doc
            if not ch.isnumeric() and ch not in punctuation
        )
        # split() with no argument splits on any whitespace run and never
        # produces empty strings, so tabs/newlines separate words too.
        out_docs.append(kept.split())
    return out_docs
def frequency_vectorizer(docs: list[str]) -> tuple[np.ndarray, np.ndarray]:
    """Calculate the relative frequency of each word in each document.

    Documents are first normalised with ``text_clean``. The frequency of a
    term in a document is its count divided by the number of words in that
    (cleaned) document; a document with no words gets an all-zero row.

    Parameters
    ----------
    docs : list[str]
        A list of text documents.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Tuple containing two elements:

        - A 2D array of shape ``(len(docs), n_terms)`` holding the frequency
          of each term in each document.
        - The feature names corresponding to the columns of the matrix, in
          sorted order. For non-empty input this is a plain ``list`` of
          ``str``; for an empty `docs` both elements are empty arrays.

    Examples
    --------
    >>> docs = ["This is a sample document.", "Another document for testing."]
    >>> result_tf_matrix, result_feature_names = frequency_vectorizer(docs)
    >>> print(result_tf_matrix)
    [[0.2  0.   0.2  0.   0.2  0.2  0.   0.2 ]
     [0.   0.25 0.25 0.25 0.   0.   0.25 0.  ]]
    >>> print(result_feature_names)
    ['a', 'another', 'document', 'for', 'is', 'sample', 'testing', 'this']
    """
    cleaned_docs = text_clean(docs)
    if not docs:
        return np.array([]), np.array([])

    # Build the vocabulary once; it fixes both the column order and the
    # width of the matrix.
    feature_names = sorted({term for doc in cleaned_docs for term in doc})
    column_of = {term: j for j, term in enumerate(feature_names)}
    tf_matrix = np.zeros((len(cleaned_docs), len(feature_names)))

    for i, doc in enumerate(cleaned_docs):
        total_terms = len(doc)
        # Only terms present in the document are written; every other cell
        # keeps its initial 0.0. An empty document has nothing to iterate,
        # so no division by zero can occur.
        for term, count in Counter(doc).items():
            tf_matrix[i, column_of[term]] = count / total_terms
    return tf_matrix, feature_names
def tfidf_vectorizer(docs: list[str]) -> tuple[np.ndarray, np.ndarray]:
    """Calculate TF-IDF scores for a list of documents.

    Documents are first normalised with ``text_clean``. For a term ``t`` in
    document ``d``:

    - ``tf(t, d)`` is the count of ``t`` in ``d`` divided by the number of
      words in ``d``;
    - ``idf(t)`` is ``log(N / (df(t) + 1))``, where ``N`` is the number of
      documents and ``df(t)`` the number of documents containing ``t``;
    - the score is ``tf(t, d) * idf(t)``.

    With this IDF variant a term found in ``N - 1`` documents scores zero
    and a term found in every document scores negative.

    Parameters
    ----------
    docs : list[str]
        A list of documents (strings).

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Tuple containing two elements:

        - A 2D array of shape ``(len(docs), n_terms)`` containing the TF-IDF
          score of each term in each document.
        - The feature names corresponding to the columns of the matrix, as
          a sorted ``list`` of ``str``.

    Examples
    --------
    >>> docs = ["red apple", "green apple", "blue sky"]
    >>> tfidf_matrix, feature_names = tfidf_vectorizer(docs)
    >>> feature_names
    ['apple', 'blue', 'green', 'red', 'sky']
    >>> tfidf_matrix.round(4).tolist()
    [[0.0, 0.0, 0.0, 0.2027, 0.0], [0.0, 0.0, 0.2027, 0.0, 0.0], [0.0, 0.2027, 0.0, 0.0, 0.2027]]
    """
    tokenized = text_clean(docs)
    n_docs = len(docs)

    term_freqs = []        # one {term: relative frequency} dict per document
    doc_freq = Counter()   # term -> number of documents containing it
    for tokens in tokenized:
        n_tokens = len(tokens)
        term_freqs.append(
            {
                word: occurrences / n_tokens
                for word, occurrences in Counter(tokens).items()
            }
        )
        # A set so that each document contributes at most 1 per term.
        doc_freq.update(set(tokens))

    inverse_doc_freq = {
        word: np.log(n_docs / (doc_freq[word] + 1)) for word in doc_freq
    }

    scores = np.zeros((n_docs, len(inverse_doc_freq)))
    feature_names = sorted(inverse_doc_freq)
    for row, freqs in enumerate(term_freqs):
        for col, word in enumerate(feature_names):
            scores[row, col] = freqs.get(word, 0) * inverse_doc_freq[word]
    return scores, feature_names
def tokenizer_padding(docs: list[str]) -> np.ndarray:
    """Convert documents into equal-length sequences of integer tokens.

    Documents are first normalised with ``text_clean``. Each distinct word
    is assigned an integer identifier, starting at 1, in order of first
    appearance across the documents. Shorter sequences are right-padded
    with 0 so that every row has the length of the longest document.

    Parameters
    ----------
    docs : list[str]
        A list of text documents.

    Returns
    -------
    np.ndarray
        2D integer array of shape ``(len(docs), max_words)`` holding the
        tokenized and padded sequences. For an empty `docs` an empty 1D
        array is returned.

    Examples
    --------
    >>> tokenized_padded = tokenizer_padding(["the first sentence", "the second longer sentence"])
    >>> print(tokenized_padded)
    [[1 2 3 0]
     [1 4 5 3]]
    >>> tokenized_padded = tokenizer_padding(["a sample text", "sample text two"])
    >>> print(tokenized_padded)
    [[1 2 3]
     [2 3 4]]
    """
    cleaned = text_clean(docs)
    if not docs:
        return np.array([], dtype=int)

    max_len = max(len(doc) for doc in cleaned)
    token_ids: dict[str, int] = {}
    # Integer dtype: the cells are identifiers, not measurements. 0 is
    # reserved for padding, which is why real tokens start at 1.
    padded = np.zeros((len(cleaned), max_len), dtype=int)
    for i, doc in enumerate(cleaned):
        for j, word in enumerate(doc):
            # A new word gets the next unused identifier; a known word
            # reuses the one assigned at its first appearance.
            padded[i, j] = token_ids.setdefault(word, len(token_ids) + 1)
    return padded