-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstrip_pos.py
More file actions
53 lines (47 loc) · 1.29 KB
/
strip_pos.py
File metadata and controls
53 lines (47 loc) · 1.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
'''
Strip the parts of speech we don't want from the tagged corpus.
This prepares the corpus for topic modelling and other processing.
mlw
'''
import os
import csv
from bs4 import BeautifulSoup
# Directory layout: tagged input files are read from INPUT_DIR and a plain-text
# version of each one is written to OUTPUT_DIR under the same file name.
BASE_DIR = './'
INPUT_DIR = BASE_DIR + 'results/treetagger/year/'
OUTPUT_DIR = BASE_DIR + 'results/stripped/year/'

# exist_ok avoids the check-then-create race of isdir() followed by makedirs().
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The index file lists every output file together with its word count.
# newline='' is what the csv module requires for files handed to csv.writer,
# and the with-block guarantees the index is flushed and closed even if a
# later file fails to process.
with open(os.path.join(OUTPUT_DIR, 'filelist.csv'), 'w', newline='') as index_fh:
    filelist = csv.writer(index_fh)
    filelist.writerow(["filename", "words"])

    # Sorted so the row order of filelist.csv is reproducible between runs
    # (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(INPUT_DIR)):
        in_path = os.path.join(INPUT_DIR, filename)
        # Skip sub-directories and other non-regular entries instead of
        # crashing when trying to open them.
        if not os.path.isfile(in_path):
            continue
        print(filename)

        # NOTE(review): files are read and written with the platform default
        # encoding, as before -- confirm the corpus encoding and pin it here.
        with open(in_path, "r") as fh:
            text = fh.read()

        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning.
        soup = BeautifulSoup(text, 'html.parser')

        words = []
        # Only text inside <s> elements is considered.
        for phrase in soup.find_all(['s']):
            for line in phrase.get_text().split('\n'):
                # Each usable line has at least three tab-separated columns;
                # presumably the tagger's word / tag / lemma output.
                tokens = line.split('\t')
                if len(tokens) < 3:
                    continue
                # The third column may hold several alternatives separated
                # by '|'; default to the first option.
                lemma = tokens[2].split('|')[0]
                # Rows tagged NUM, and rows whose third column is empty,
                # keep the first column as-is; all others use the third.
                if tokens[1] == 'NUM' or not lemma:
                    words.append(tokens[0])
                else:
                    words.append(lemma)
                # To keep only selected parts of speech (e.g. 'NOM', 'ADJ'),
                # filter on tokens[1] here.

        outfile = os.path.join(OUTPUT_DIR, filename)
        with open(outfile, "w") as fh:
            fh.write(' '.join(words))
        filelist.writerow([outfile, len(words)])