31 lines
870 B
Python
31 lines
870 B
Python
# Import libraries
|
|
import nltk
|
|
from nltk.corpus import stopwords
|
|
from nltk.tokenize import word_tokenize
|
|
|
|
# Ensure the NLTK data used by this script is available locally.
# `word_tokenize` relies on the Punkt sentence tokenizer models: older NLTK
# releases load the 'punkt' package, while recent releases (3.9 and later)
# load 'punkt_tab' instead, so both are fetched to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read by `stopwords.words('english')`.
nltk.download('stopwords')
|
|
|
|
def remove_stop_words(text):
    """Return ``text`` with English stop words filtered out.

    The input is split into tokens with ``word_tokenize``. Every token whose
    lowercase form appears in NLTK's English stop-word list is dropped, and
    the surviving tokens are joined back together with single spaces, so the
    original spacing of ``text`` is not preserved.

    Args:
        text: The string to filter.

    Returns:
        A string made of the remaining tokens separated by one space each.
    """
    tokens = word_tokenize(text)

    # A set gives constant-time membership checks during filtering.
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        # Compare case-insensitively, but keep the token's original casing.
        if token.lower() in english_stops:
            continue
        kept.append(token)

    return ' '.join(kept)
|
|
|
|
# Demo: filter a sample sentence when this file is run as a script.
if __name__ == "__main__":
    input_text = "This is an example of a text document that needs stop word removal."
    preprocessed_text = remove_stop_words(input_text)

    # Show the sentence before and after filtering, each under its heading.
    for heading, body in (
        ("Original Text:", input_text),
        ("\nPreprocessed Text:", preprocessed_text),
    ):
        print(heading)
        print(body)
|
|
|