NLPFinalProject/Preprocessing.py at main · gencnis/NLPFinalProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
'''
This module, 'Preprocessing.py', preprocesses a given document by casefolding, tokenizing, and removing punctuation.

@author: Nisanur Genc
@author: Alex Wills
'''

from nltk import word_tokenize, sent_tokenize
import os
import re

def preprocess(document):
    '''
    Casefolds a document and returns a list of the sentences it contains.
    Sentences in this list are not tokenized into words and still contain punctuation.
    The first and last sentences are edited specifically for the corpus generated by CreateCorpus.py with
    the Stardew Valley Wiki, whose first and last sentences are not the same as the first and last sentences
    generated by nltk's sent_tokenize: the first tokenized sentence is expected to be mixed with header
    lines, and the last two tokenized sentences are not part of the article.

    @param document - the document to analyze in the form of a string.

    @return - a list containing the casefolded sentences in the document. The list is empty when the
              document is blank or when it contains two sentences or fewer.
    '''
    # Casefold
    text = document.casefold()

    # A blank document has no sentences; returning here avoids indexing an empty list below
    if not text.strip():
        return []

    # Get sentences
    sentences = sent_tokenize(text)
    if not sentences:
        return []

    # First sentence is expected to be mixed with header, so we isolate it.
    # Each match is a newline followed by the rest of that line up to its last "."
    header_matches = re.findall(r"\n.+\.", sentences[0])
    if header_matches:
        # True first sentence is the last line that ends with a "."; [1:] drops the leading newline
        sentences[0] = header_matches[-1][1:]

    # Cut off last 2 sentences, which aren't part of the article
    return sentences[:-2]


# Translation table that deletes every punctuation character stripped by tokenize_sentence
_PUNCTUATION_TABLE = str.maketrans("", "", "!()-[]{};:'\"’“”\\,<>./?@#$%^&*_~")


def tokenize_sentence(sentence):
    '''
    Tokenizes and further processes a sentence to remove punctuation and split it into a list of words.
    Punctuation characters are deleted (not replaced by spaces) before the sentence is split with
    nltk's word_tokenize.

    @param sentence - the sentence (string) to tokenize into words
    @return - a list of casefolded words without punctuation; an empty list if nothing but
              punctuation and whitespace was given.
    '''
    # Strings are immutable, so the casefolded and punctuation-free text must be captured in a new
    # variable; all punctuation is removed in a single pass
    cleaned = sentence.casefold().translate(_PUNCTUATION_TABLE)

    # Nothing left to split, so skip the tokenizer call
    if not cleaned.strip():
        return []

    # Tokenize once, after all punctuation has been removed
    return word_tokenize(cleaned)


def main():
    '''
    Manual check used for testing: reads one sample corpus file located next to this script,
    runs it through preprocess, and prints a short marker once that has finished.
    '''
    # Locate the sample article relative to this script's own directory
    script_dir = os.path.dirname(__file__)
    sample_path = os.path.join(script_dir, "Corpus/Abigail.txt")

    # An arbitrary corpus file, opened purely for testing purposes
    with open(sample_path, mode="r", encoding="utf-8") as corpus_file:
        document = preprocess(corpus_file.read())
        print("hi")

# Run the manual check only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    main()