@@ -218,29 +218,85 @@ def bare_command(doc):
 def Question(doc):
     """
     Counts the number of sentences containing question words and question marks.
-
+    Reference: https://github.com/bbevis/politenessPy/blob/main/strategy_extractor.py
     Args:
         doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed.
-
     Returns:
         tuple: A tuple containing the counts of Yes/No questions and WH-questions.
     """
-
-    keywords = set(['who', 'what', 'where', 'when', 'why', 'how', 'which'])
-    tags = set(['WRB', 'WP', 'WDT'])
-
-    # doc = nlp(text)
-    sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
-    all_qs = len(sentences)
-
-    n = 0
-    for i in range(len(sentences)):
-        whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
-
-        if len(whq) > 0:
-            n += 1
-
-    return all_qs - n, n
+    # POS tags for WH-words like who/what/where
+    search_tags = {'WRB', 'WP', 'WDT'}
+    # WH-words and the auxiliaries that commonly follow them in real questions
+    wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
+    wh_followers = {
+        'what': {'are', 'is', 'do', 'does', 'can', 'should', 'might'},
+        'who': {'is', 'are', 'was', 'can', 'should'},
+        'where': {'is', 'are', 'can', 'should'},
+        'when': {'is', 'are', 'can', 'should'},
+        'why': {'is', 'are', 'do', 'does', 'can', 'might', 'would'},
+        'how': {'is', 'are', 'do', 'does', 'can', 'should', 'would'},
+        'which': {'is', 'are', 'was', 'can', 'should'}
+    }
+    # Auxiliaries that typically initiate Yes/No questions
+    yesno_aux = {'do', 'does', 'did', 'have', 'has', 'had',
+                 # 'can', 'could', 'will', 'would',
+                 'may', 'might', 'shall', 'should',
+                 'is', 'are', 'was', 'were', 'am'}
+    # Pronouns that often follow auxiliaries in Yes/No questions
+    pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it'}
+    # filler_words = {'ok', 'so', 'well', 'like', 'you know', 'i mean', 'actually', 'basically', 'right', 'just', 'uh', 'um', 'oh', 'hmm'}
+
+    wh_count = 0
+    yesno_count = 0
+    counted_sentences = set()
+    for sent in doc.sents:
+        sent_text = sent.text.strip()
+        sent_tokens = list(sent)
+        if not sent_tokens:
+            continue
+        # Method 1: find question sentences by checking for '?' at the end
+        if sent_text.endswith('?'):
+            # Try to find a genuine WH question word: tagged WRB/WP/WDT,
+            # not inside a relative clause, and preceding the sentence root.
+            wh = False
+            for token in sent_tokens:
+                if (token.text.lower() in wh_words and token.tag_ in search_tags
+                        and token.dep_ not in {'relcl', 'acl'}
+                        and token.i < sent.root.i):
+                    wh = True
+                    break
+            if wh:
+                wh_count += 1
+            else:
+                # Fallback: no WH-word in the sentence -> treat as Yes/No question
+                yesno_count += 1
+            counted_sentences.add(sent.start)
+            continue
+        # Method 2: for the remaining sentences, apply lexical rule-based
+        # detection over adjacent token pairs.
+        if sent.start in counted_sentences:
+            continue  # already counted
+        for i in range(len(sent_tokens) - 1):
+            t1_lower = sent_tokens[i].text.lower()
+            t2_lower = sent_tokens[i + 1].text.lower()
+            # WH pattern: WH-word followed by one of its typical auxiliaries
+            if t1_lower in wh_words and t2_lower in wh_followers.get(t1_lower, set()):
+                wh_count += 1
+                counted_sentences.add(sent.start)
+                break
+            # Yes/No pattern: auxiliary followed by a personal pronoun
+            if t1_lower in yesno_aux and t2_lower in pronoun_followers:
+                yesno_count += 1
+                counted_sentences.add(sent.start)
+                break
+    return yesno_count, wh_count
 
 
 def word_start(keywords, doc):
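
For context, a minimal usage sketch of the rewritten Question helper (not part of the commit). The en_core_web_sm model name and the sample text are illustrative assumptions; any English spaCy pipeline with a tagger and dependency parser should work, since the function reads token.tag_, token.dep_, and sent.root.

import spacy

# Assumption: en_core_web_sm is only a convenient default; any English
# pipeline that provides POS tags and a dependency parse will do.
nlp = spacy.load("en_core_web_sm")

doc = nlp("Where is the report? Did you send it? Do you need more time")
yesno, wh = Question(doc)
# "Where is the report?"  -> WH-question via Method 1 ('?' plus WH-word)
# "Did you send it?"      -> Yes/No question via Method 1 ('?', no WH-word)
# "Do you need more time" -> Yes/No question via Method 2 (aux + pronoun, no '?')
print(yesno, wh)  # 2 1

Note that because 'can'/'could'/'will'/'would' are commented out of yesno_aux, an unpunctuated "Could you help" is not counted by Method 2 (with a trailing '?' it is still caught by Method 1); re-enable those entries if that matters for your data.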