-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreprocessing.py
More file actions
71 lines (51 loc) · 2.16 KB
/
Preprocessing.py
File metadata and controls
71 lines (51 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
'''
This module, 'Preprocessing.py', preprocesses a given document by casefolding, tokenizing, and removing punctuation.
@author: Nisanur Genc
@author: Alex Wills
'''
from nltk import word_tokenize, sent_tokenize
import os
import re
def preprocess(document):
    '''
    Preprocesses a document and returns a list of sentences in the document.
    Sentences in this list are casefolded, but are not tokenized into words and still contain punctuation.

    The first and last sentences are edited specifically for the corpus generated by CreateCorpus.py with
    the Stardew Valley Wiki, whose first and last sentences are not the same as the first and last sentences
    generated by nltk's sent_tokenize:
      - if the first sentence contains header lines, only its last line ending in "." is kept;
      - the last 2 sentences are always dropped, so a document with 2 or fewer sentences yields an empty list.

    @param document - the document to analyze in the form of a string.
    @return - a list containing the sentences in the document (empty if no sentences were found).
    '''
    # Casefold first so every returned sentence is already case-normalized
    sentences = sent_tokenize(document.casefold())

    # sent_tokenize may return no sentences (e.g. for an empty document);
    # without this guard, sentences[0] below would raise IndexError.
    if not sentences:
        return []

    # First sentence is expected to be mixed with header, so we isolate it.
    # Each match starts with the newline that precedes the line and ends at a ".".
    header_matches = re.findall(r"\n.+\.", sentences[0])
    if header_matches:
        # True first sentence is the last line that ends with a "."; [1:] drops the leading newline
        sentences[0] = header_matches[-1][1:]

    # Cut off last 2 sentences, which aren't part of the article
    return sentences[:-2]
# Characters stripped from a sentence before it is split into words.
# Raw string so that the backslash is a literal character rather than an (invalid) escape sequence.
_PUNCTUATION = r'''!()-[]{};:'"’“”\,<>./?@#$%^&*_~'''
# Translation table built once at import time; maps every punctuation character to "deleted".
_PUNCTUATION_TABLE = str.maketrans("", "", _PUNCTUATION)


def tokenize_sentence(sentence):
    '''
    Tokenizes and further processes a sentence to remove punctuation and split it into a list of words.
    Punctuation characters are deleted (not replaced by spaces) before the sentence is passed to
    nltk's word_tokenize, so for example "don't" becomes "dont".

    @param sentence - the sentence (string) to tokenize into words
    @return - a list of casefolded words without punctuation.
    '''
    # Strings are immutable: casefold() and translate() return new strings, so the result
    # must be kept (calling str.replace() without using its return value removes nothing).
    cleaned = sentence.casefold().translate(_PUNCTUATION_TABLE)
    return word_tokenize(cleaned)
def main():
    '''
    Manual test entry point: reads one sample corpus file located next to this script,
    runs it through preprocess, and prints a marker once done.
    '''
    # Resolve the sample file relative to this script rather than the current working directory
    script_dir = os.path.dirname(__file__)
    sample_path = os.path.join(script_dir, "Corpus/Abigail.txt")

    # An arbitrary corpus file is used here purely for testing purposes
    with open(sample_path, mode="r", encoding="utf-8") as corpus_file:
        raw_text = corpus_file.read()
        document = preprocess(raw_text)

    print("hi")


# Only run the manual test when executed as a script, not when imported
if __name__ == "__main__":
    main()