# ToolsForTextAnalysis/strip_pos.py at master · mwidner/ToolsForTextAnalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
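'''
Strip parts of speech we don't want.
Prepares the corpus for topic modelling and other processing.

For every file in INPUT_DIR, the text inside <s> elements is read line by
line as tab-separated fields. The third field (presumably the lemma) is
kept for each token; the first field (presumably the surface form) is kept
instead when the second field is 'NUM' or the third field is empty. The
kept words are written, space-separated, to a file of the same name in
OUTPUT_DIR, and the per-file word count is recorded in filelist.csv.

Note: no part-of-speech filter is currently applied, so every token line
with at least three fields contributes one word.

mlw
'''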

import os
import csv
from bs4 import BeautifulSoup

BASE_DIR = './'
INPUT_DIR = BASE_DIR + 'results/treetagger/year/'
OUTPUT_DIR = BASE_DIR + 'results/stripped/year/'


def _select_word(line):
	'''
	Pick the word to keep for one tab-separated token line.

	line -- a single line of text; expected to hold at least three
	        tab-separated fields (presumably token, POS tag, lemma --
	        confirm against the tagger output format).

	Returns the third field, truncated at the first '|' when several
	alternatives are listed, or the first field when the second field is
	'NUM' or the third field is empty. Returns None for lines with fewer
	than three fields so the caller can skip them.
	'''
	tokens = line.split('\t')
	if len(tokens) < 3:
		return None
	# Alternatives are separated by '|'; default to the first option.
	lemma = tokens[2].split('|')[0]
	if tokens[1] == 'NUM' or not lemma:
		return tokens[0]
	return lemma


# exist_ok avoids the check-then-create race of a separate isdir() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# newline='' is what the csv module expects for files it writes to; without
# it, rows end in '\r\r\n' on platforms that translate '\n' on output.
# The with-block guarantees the index file is flushed and closed even if
# processing one of the input files raises.
with open(OUTPUT_DIR + 'filelist.csv', 'w', newline='') as listfile:
	filelist = csv.writer(listfile)
	filelist.writerow(["filename", "words"])

	for filename in os.listdir(INPUT_DIR):
		print(filename)
		with open(INPUT_DIR + filename, "r") as fh:
			text = fh.read()
		# NOTE(review): no parser is named here, so bs4 chooses one based on
		# what is installed; consider pinning it once the parser used for
		# existing results is confirmed.
		soup = BeautifulSoup(text)
		words = list()
		for phrase in soup.find_all(['s']):
			for line in phrase.get_text().split('\n'):
				word = _select_word(line)
				if word is not None:
					words.append(word)
		outfile = OUTPUT_DIR + filename
		with open(outfile, "w") as fh:
			fh.write(' '.join(words))
		# One index row per input file: output path and number of words kept.
		filelist.writerow([outfile, len(words)])