Commit c3b4422

sync politeness updates

1 parent 1df76ce commit c3b4422

3 files changed: +130 -23 lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -58,4 +58,5 @@ node_modules/
 # testing
 /output
 /vector_data
-test.py
+test.py
+test.ipynb

src/team_comm_tools/features/keywords.py

Lines changed: 54 additions & 4 deletions
@@ -1,3 +1,4 @@
+# reference: https://github.com/bbevis/politenessPy/blob/main/keywords.py
 kw = {
     "spacy_neg_only": {
         "Negative_Emotion": [
@@ -7260,7 +7261,34 @@
             " sorry ",
             " woops ",
             " whoops ",
-            " oops "
+            " oops ",
+            " apology "
+        ],
+        "Third_Person": [
+            " he ",
+            " him ",
+            " his ",
+            " himself ",
+            " she ",
+            " her ",
+            " hers ",
+            " herself ",
+            " they ",
+            " them ",
+            " their ",
+            " theirs ",
+            " themselves "
+        ],
+        "Contrast_Conjunction": [
+            " but ",
+            " however ",
+            " instead ",
+            " although ",
+            " even though ",
+            " despite ",
+            " and yet ",
+            " nevertheless ",
+            " nonetheless "
         ],
         "Ask_Agency": [
             " do me a favor ",
@@ -7365,7 +7393,7 @@
         "Gratitude": [
             " thank ",
             " thanks ",
-            " thank you ",
+            #" thank you ",
             " grateful ",
             " gratitude ",
             " cheers "
@@ -14419,25 +14447,47 @@
             " cock ",
             " crap ",
             " damn ",
+            " dammit ",
+            " damnit ",
             " dick ",
+            " dickhead ",
+            " dick-head ",
             " dumb ",
+            " dumbass ",
+            " dumb-ass ",
+            " dumb ass ",
             " dyke ",
             " fuck ",
+            " fucking ",
+            " fucker ",
             " goddam ",
+            " goddammit ",
+            " goddamed ",
             " hell ",
+            " horshit ",
             " homo ",
+            " jackass ",
+            " jackass ",
+            " motherfucker ",
+            " mother-fucker ",
+            " motherfucking ",
             " nigger ",
+            " nigra ",
             " piss ",
             " prick ",
             " pussy ",
             " queer ",
             " screw ",
             " shit ",
+            " shite ",
+            " shitting ",
             " sob ",
-            " sonofa ",
             " suck ",
             " sucked ",
-            " sucks "
+            " sucks ",
+            " twat ",
+            " wanker ",
+            " whore "
         ],
         "Truth_Intensifier": [
             " really ",

src/team_comm_tools/features/politeness_v2_helper.py

Lines changed: 74 additions & 18 deletions
@@ -218,29 +218,85 @@ def bare_command(doc):
 def Question(doc):
     """
     Counts the number of sentences containing question words and question marks.
-
+    Reference: https://github.com/bbevis/politenessPy/blob/main/strategy_extractor.py
     Args:
         doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed.
-
     Returns:
         tuple: A tuple containing the counts of Yes/No questions and WH-questions.
     """
-
-    keywords = set([' who ', ' what ', ' where ', ' when ', ' why ', ' how ', ' which '])
-    tags = set(['WRB', 'WP', 'WDT'])
-
-    # doc = nlp(text)
-    sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
-    all_qs = len(sentences)
-
-    n = 0
-    for i in range(len(sentences)):
-        whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
-
-        if len(whq) > 0:
-            n += 1
-
-    return all_qs - n, n
+    # POS tags for WH-words like who/what/where
+    search_tags = {'WRB', 'WP', 'WDT'}
+    # WH-words and common auxiliaries that follow them in real questions
+    wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
+    wh_followers = {
+        'what': {'are', 'is', 'do', 'does', 'can', 'should', 'might'},
+        'who': {'is', 'are', 'was', 'can', 'should'},
+        'where': {'is', 'are', 'can', 'should'},
+        'when': {'is', 'are', 'can', 'should'},
+        'why': {'is', 'are', 'do', 'does', 'can', 'might', 'would'},
+        'how': {'is', 'are', 'do', 'does', 'can', 'should', 'would'},
+        'which': {'is', 'are', 'was', 'can', 'should'}
+    }
+    # Auxiliaries that typically initiate Yes/No questions
+    yesno_aux = {'do', 'does', 'did', 'have', 'has', 'had',
+                 # 'can', 'could', 'will', 'would',
+                 'may', 'might', 'shall', 'should',
+                 'is', 'are', 'was', 'were', 'am'}
+    # Pronouns that often follow auxiliaries in Yes/No questions
+    pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it'}
+    # filler_words = {'ok', 'so', 'well', 'like', 'you know', 'i mean', 'actually', 'basically', 'right', 'just', 'uh', 'um', 'oh', 'hmm', 'like'}
+
+    wh_count = 0
+    yesno_count = 0
+    counted_sentences = set()
+    for sent in doc.sents:
+        sent_text = sent.text.strip()
+        sent_tokens = list(sent)
+        if not sent_tokens:
+            continue
+        # Method 1: Find question sentences by checking for '?' at end
+        if sent_text.endswith('?'):
+            # try to find the first WH-word in the sentence
+            wh = False
+            for token in sent_tokens:
+                if token.text.lower() in wh_words and token.tag_ in search_tags and token.dep_ not in {"relcl", "acl"}\
+                        and token.i < sent.root.i:
+                    wh = True
+                    break
+            if wh:
+                wh_count += 1
+            else:
+                # Fallback: no WH in the sentence → treat as Yes/No question
+                yesno_count += 1
+            counted_sentences.add(sent.start)
+            continue
+        # Method 2: For remaining sentences, apply lexical rule-based detection --- Extract tokens and their metadata for fast access
+        for i in range(len(sent_tokens) - 1):
+            tok1 = sent_tokens[i]
+            tok2 = sent_tokens[i + 1]
+            t1_lower = tok1.text.lower()
+            t2_lower = tok2.text.lower()
+            if sent.start in counted_sentences:
+                break  # already counted
+            # WH pattern
+            if t1_lower in wh_words and t2_lower in wh_followers.get(t1_lower, set()):
+                wh_count += 1
+                counted_sentences.add(sent.start)
+                break
+            # Yes/No pattern
+            if t1_lower in yesno_aux and t2_lower in pronoun_followers:
+                yesno_count += 1
+                counted_sentences.add(sent.start)
+                break
+    return yesno_count, wh_count
+    # sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
+    # all_qs = len(sentences)
+    # n = 0
+    # for i in range(len(sentences)):
+    #     whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
+    #     if len(whq) > 0:
+    #         n += 1
+    # return all_qs - n, n


 def word_start(keywords, doc):
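
For context, here is a minimal usage sketch of the rewritten Question helper. The snippet is not part of the commit; the spaCy model name and the sample text are illustrative, and the import path is inferred from the file location under src/.

# Hypothetical usage of Question(); assumes a spaCy English pipeline is installed.
import spacy
from team_comm_tools.features.politeness_v2_helper import Question

nlp = spacy.load("en_core_web_sm")
doc = nlp("Where is the report? Did you send it? I sent it yesterday.")

# "Where is the report?" has a WH-word before the root -> counted as a WH question.
# "Did you send it?" ends in '?' with no WH-word -> counted as Yes/No via the fallback.
yesno, wh = Question(doc)
print(yesno, wh)  # expected: 1 1

Both the old and new versions return (yes/no count, WH count); the practical difference is that the new version can also catch questions that lack a question mark, via the auxiliary-plus-pronoun and WH-word-plus-auxiliary patterns.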
