diff --git a/dialectid/__init__.py b/dialectid/__init__.py index 344bb18..8fdbb91 100644 --- a/dialectid/__init__.py +++ b/dialectid/__init__.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -__version__ = '0.1.4' +__version__ = '0.1.5' # from dialectid.text_repr import BoW, SeqTM from dialectid.model import DialectId, BoW diff --git a/dialectid/model.py b/dialectid/model.py index d87bd82..db18741 100644 --- a/dialectid/model.py +++ b/dialectid/model.py @@ -103,7 +103,8 @@ def decision_function(self, texts: list): X = np.c_[-X[:, 0], X[:, 0]] return X - def positive(self, texts: list): + def positive(self, texts: list, + margin: float=0): """Positive classes""" X = self.transform(texts) X_df = X @@ -115,7 +116,7 @@ def positive(self, texts: list): X = X_df output = [] labels = self.countries - for mask, value in zip(X_df > 0, X): + for mask, value in zip(X_df > margin, X): _ = {str(k): v for k, v in zip(labels[mask], value[mask])} output.append(_) return output diff --git a/dialectid/tests/test_model.py b/dialectid/tests/test_model.py index 59ddd89..70838f2 100644 --- a/dialectid/tests/test_model.py +++ b/dialectid/tests/test_model.py @@ -146,6 +146,8 @@ def test_DialectId_positive(): dial = DialectId(lang='es') output1 = dial.positive(['comiendo unos tacos']) assert 'mx' in output1[0] and 'gt' in output1[0] + output2 = dial.positive(['comiendo unos tacos'], margin=1) + assert len(output1[0]) > len(output2[0]) dial.probability = True output2 = dial.positive(['comiendo unos tacos']) assert len(output2) == len(output1) diff --git a/quarto/dialectid.qmd b/quarto/dialectid.qmd index 0bb4ef2..fe7da0f 100644 --- a/quarto/dialectid.qmd +++ b/quarto/dialectid.qmd @@ -437,8 +437,6 @@ detect.predict(['comiendo unos tacos', ``` ::: -## Column - ::: {.card title='Countries' .flow} The available dialects for each language can be identified in the attribute `countries`, as seen in the following snippet for Spanish. 
@@ -452,6 +450,8 @@ detect.countries ``` ::: +## Column + ::: {.card title='Decision Function' .flow} One might be interested in all the countries from which the speaker could come. To facilitate this, one can use the `decision_function` method. DialectId uses linear Support Vector Machines (SVM) as classifiers; consequently, the positive values in the `decision_function` are interpreted as belonging to the positive class, i.e., a particular country. The following code exemplifies this idea: the first two lines import and instantiate the `DialectId` class in Spanish. The third line computes the decision-function values; it returns a two-dimensional array where the first dimension corresponds to the number of texts. In this case, it keeps only the decision-function values, where the positive values indicate the presence of the particular country. The fourth line sorts the values where the highest value is the first element. The fifth line retrieves the country and its associated decision-function values, considering only those countries with positive values. @@ -468,6 +468,20 @@ index = df.argsort()[::-1] ``` ::: +::: {.card title='Positive class' .flow} +In the case where one is interested in the positive classes, as described in the previous example, `DialectId` implements the `DialectId.positive` method to retrieve the positive labels in a list of texts, as shown in the following example. + +```{python} +#| echo: true +#| label: positive-default + +from dialectid import DialectId +detect = DialectId(lang='es') +pos = detect.positive(['acompañando el asado con un buen vino'])[0] +pos +``` +::: + ::: {.card title='Probability' .flow} In some situations, one is interested in the probability instead of the decision-function values of a linear SVM. The probability can be computed using the `predict_proba` method. The following code exemplifies this idea: the first line imports the `DialectId` class as in previous examples. 
The second line differs from the last example in that the parameter `probability` is set to true. The rest of the lines are almost equivalent to the previous example. @@ -482,6 +496,18 @@ index = prob.argsort()[::-1] [(detect.countries[i], prob[i]) for i in index[:4]] ``` + +The `DialectId.positive` method can also be used when one is interested in the probabilities of the positive classes, as shown in the following lines. + +```{python} +#| echo: true +#| label: positive-probability + +from dialectid import DialectId +detect = DialectId(lang='es', probability=True) +pos = detect.positive(['acompañando el asado con un buen vino'])[0] +pos +``` ::: # Corpora