diff --git a/orangecontrib/prototypes/widgets/icons/TextToColumns.svg b/orangecontrib/prototypes/widgets/icons/Split.svg similarity index 100% rename from orangecontrib/prototypes/widgets/icons/TextToColumns.svg rename to orangecontrib/prototypes/widgets/icons/Split.svg diff --git a/orangecontrib/prototypes/widgets/owtexttocolumns.py b/orangecontrib/prototypes/widgets/owsplit.py similarity index 58% rename from orangecontrib/prototypes/widgets/owtexttocolumns.py rename to orangecontrib/prototypes/widgets/owsplit.py index 3507171b..05fe042a 100644 --- a/orangecontrib/prototypes/widgets/owtexttocolumns.py +++ b/orangecontrib/prototypes/widgets/owsplit.py @@ -9,7 +9,8 @@ from Orange.widgets.widget import OWWidget, Msg, Output, Input from Orange.widgets.utils.itemmodels import DomainModel from Orange.widgets.utils.widgetpreview import WidgetPreview -from Orange.data import Table, Domain, DiscreteVariable, StringVariable +from Orange.data import \ + Table, Domain, DiscreteVariable, StringVariable, ContinuousVariable from Orange.data.util import SharedComputeValue, get_unique_names from orangewidget.settings import Setting @@ -20,13 +21,25 @@ def get_substrings(values, delimiter): - {""}) -class SplitColumn: +class SplitColumnBase: def __init__(self, data, attr, delimiter): self.attr = attr self.delimiter = delimiter column = set(data.get_column(self.attr)) self.new_values = tuple(get_substrings(column, self.delimiter)) + def __eq__(self, other): + return self.attr == other.attr \ + and self.delimiter == other.delimiter \ + and self.new_values == other.new_values + + def __hash__(self): + return hash((self.attr, self.delimiter, self.new_values)) + + +class SplitColumnOneHot(SplitColumnBase): + InheritEq = True + def __call__(self, data): column = data.get_column(self.attr) values = [{ss.strip() for ss in s.split(self.delimiter)} @@ -35,62 +48,83 @@ def __call__(self, data): dtype=int) for v in self.new_values} - def __eq__(self, other): - return self.attr == other.attr \ - and self.delimiter == other.delimiter \ - and self.new_values == other.new_values - def __hash__(self): - return hash((self.attr, self.delimiter, self.new_values)) +class SplitColumnCounts(SplitColumnBase): + InheritEq = True + + def __call__(self, data): + column = data.get_column(self.attr) + values = [[ss.strip() for ss in s.split(self.delimiter)] + for s in column] + return {v: np.array([xs.count(v) for xs in values], dtype=float) + for v in self.new_values} -class OneHotStrings(SharedComputeValue): +class StringEncodingBase(SharedComputeValue): def __init__(self, fn, new_feature): super().__init__(fn) self.new_feature = new_feature + def __eq__(self, other): + return super().__eq__(other) and self.new_feature == other.new_feature + + def __hash__(self): + return super().__hash__() ^ hash(self.new_feature) + + +class OneHotStrings(StringEncodingBase): + InheritEq = True + def compute(self, data, shared_data): indices = shared_data[self.new_feature] col = np.zeros(len(data)) col[indices] = 1 return col - def __eq__(self, other): - return super().__eq__(other) and self.new_feature == other.new_feature - def __hash__(self): - return super().__hash__() ^ hash(self.new_feature) +class CountStrings(StringEncodingBase): + InheritEq = True + + def compute(self, data, shared_data): + return shared_data[self.new_feature] -class OneHotDiscrete: - def __init__(self, variable, delimiter, value): +class DiscreteEncoding: + def __init__(self, variable, delimiter, onehot, value): self.variable = variable - self.value = value self.delimiter = delimiter + self.onehot = onehot + self.value = value def __call__(self, data): column = data.get_column(self.variable).astype(float) col = np.zeros(len(column)) col[np.isnan(column)] = np.nan for val_idx, value in enumerate(self.variable.values): - if self.value in value.split(self.delimiter): - col[column == val_idx] = 1 + parts = value.split(self.delimiter) + if self.onehot: + col[column == val_idx] = int(self.value in parts) + else: + col[column == val_idx] = parts.count(self.value) return col def __eq__(self, other): return self.variable == other.variable \ and self.value == other.value \ - and self.delimiter == other.delimiter + and self.delimiter == other.delimiter \ + and self.onehot == other.onehot def __hash__(self): - return hash((self.variable, self.value, self.delimiter)) + return hash((self.variable, self.value, self.delimiter, self.onehot)) -class OWTextToColumns(OWWidget): - name = "Text to Columns" +class OWSplit(OWWidget): + name = "Split" description = "Split text or categorical variables into binary indicators" - icon = "icons/TextToColumns.svg" - keywords = ["split"] + icon = "icons/Split.svg" + keywords = ["text to columns", "word encoding", "questionnaire", "survey", + "term", "word presence", "word counts", "categorical encoding", + "indicator variables"] priority = 700 replaces = ["orangecontrib.prototypes.widgets.owsplit.OWSplit"] @@ -106,9 +140,13 @@ class Warning(OWWidget.Warning): want_main_area = False resizing_enabled = False + Categorical, Numerical, Counts = range(3) + OutputLabels = ("Categorical (No, Yes)", "Numerical (0, 1)", "Counts") + settingsHandler = DomainContextHandler() attribute = ContextSetting(None) delimiter = ContextSetting(";") + output_type = ContextSetting(Categorical) auto_apply = Setting(True) def __init__(self): @@ -123,8 +161,14 @@ def __init__(self): model=DomainModel(valid_types=(StringVariable, DiscreteVariable))) gui.lineEdit( - variable_select_box, self, "delimiter", - orientation=Qt.Horizontal, callback=self.apply.deferred) + variable_select_box, self, "delimiter", "Delimiter: ", + orientation=Qt.Horizontal, callback=self.apply.deferred, + controlWidth=20).box.layout().addStretch(1) + + gui.radioButtonsInBox( + self.controlArea, self, "output_type", self.OutputLabels, + box="Output Values", + callback=self.apply.deferred) gui.auto_apply(self.buttonsArea, self, commit=self.apply) @@ -150,20 +194,8 @@ def apply(self): self.Outputs.data.send(None) return var = self.data.domain[self.attribute] - - if var.is_discrete: - values = get_substrings(var.values, self.delimiter) - computer = partial(OneHotDiscrete, var, self.delimiter) - else: - sc = SplitColumn(self.data, var, self.delimiter) - values = sc.new_values - computer = partial(OneHotStrings, sc) - names = get_unique_names(self.data.domain, values, equal_numbers=False) - - new_columns = tuple(DiscreteVariable( - name, values=("0", "1"), compute_value=computer(value) - ) for value, name in zip(values, names)) - + values, computer = self._get_compute_value(var) + new_columns = self._get_new_columns(values, computer) new_domain = Domain( self.data.domain.attributes + new_columns, self.data.domain.class_vars, self.data.domain.metas @@ -171,7 +203,35 @@ def apply(self): extended_data = self.data.transform(new_domain) self.Outputs.data.send(extended_data) + def _get_compute_value(self, var): + if var.is_discrete: + values = get_substrings(var.values, self.delimiter) + computer = partial( + DiscreteEncoding, + var, self.delimiter, self.output_type != self.Counts) + else: + if self.output_type == self.Counts: + sc = SplitColumnCounts(self.data, var, self.delimiter) + computer = partial(CountStrings, sc) + else: + sc = SplitColumnOneHot(self.data, var, self.delimiter) + computer = partial(OneHotStrings, sc) + values = sc.new_values + return values, computer + + def _get_new_columns(self, values, computer): + names = get_unique_names(self.data.domain, values, equal_numbers=False) + if self.output_type == self.Categorical: + return tuple( + DiscreteVariable( + name, ("No", "Yes"), compute_value=computer(value)) + for value, name in zip(values, names)) + else: + return tuple( + ContinuousVariable( + name, compute_value=computer(value)) + for value, name in zip(values, names)) + if __name__ == "__main__": # pragma: no cover - WidgetPreview(OWTextToColumns).run(Table.from_file( - "tests/orange-in-education.tab")) + WidgetPreview(OWSplit).run(Table.from_file("tests/orange-in-education.tab")) diff --git a/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py b/orangecontrib/prototypes/widgets/tests/test_owsplit.py similarity index 56% rename from orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py rename to orangecontrib/prototypes/widgets/tests/test_owsplit.py index c685f090..34aa6f87 100644 --- a/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py +++ b/orangecontrib/prototypes/widgets/tests/test_owsplit.py @@ -7,20 +7,30 @@ from Orange.data import Table, StringVariable, Domain, DiscreteVariable from Orange.widgets.tests.base import WidgetTest -from orangecontrib.prototypes.widgets.owtexttocolumns import \ - OWTextToColumns, SplitColumn, get_substrings, OneHotStrings, OneHotDiscrete +from orangecontrib.prototypes.widgets.owsplit import \ + OWSplit, SplitColumnOneHot, get_substrings, OneHotStrings, \ + DiscreteEncoding, SplitColumnCounts, CountStrings class TestComputation(unittest.TestCase): def setUp(self): - domain = Domain([DiscreteVariable("x", values=("a c d", "bb d"))], None, - [StringVariable("foo"), StringVariable("bar")]) + domain = Domain( + [ + DiscreteVariable("x", values=("a c d c bb bb bb", "bb d")) + ], + None, + [ + StringVariable("foo"), + StringVariable("bar") + ]) self.data = Table.from_numpy( domain, - np.array([1, 0, np.nan])[:, None], None, - [["a,bbb,d", "e;f o"], ["", "f o"], ["bbb,d", "e;a;o"]] + np.array([[1], [0], [np.nan]]), None, + [["a,bbb,d,a,a", "e;f o"], ["", "f o"], ["bbb,d,bbb", "e;a;o"]] ) + +class TestSplitColumn(TestComputation): def test_get_string_values(self): np.testing.assert_equal( set(get_substrings({"a bc", "d,e", "", "f,a t", "t"}, " ")), @@ -29,8 +39,8 @@ def test_get_string_values(self): set(get_substrings({"a bc", "d,e", "", "f,a t", "t"}, ",")), {"a bc", "d", "e", "f", "a t", "t"}) - def test_split_column(self): - sc = SplitColumn(self.data, self.data.domain.metas[0], ",") + def test_split_column_one_hot(self): + sc = SplitColumnOneHot(self.data, self.data.domain.metas[0], ",") shared = sc(self.data) self.assertEqual(set(sc.new_values), {"a", "bbb", "d"}) self.assertEqual(set(shared), set(sc.new_values)) @@ -38,7 +48,7 @@ def test_split_column(self): np.testing.assert_equal(shared["bbb"], [0, 2]) np.testing.assert_equal(shared["d"], [0, 2]) - sc = SplitColumn(self.data, self.data.domain.metas[1], ";") + sc = SplitColumnOneHot(self.data, self.data.domain.metas[1], ";") shared = sc(self.data) self.assertEqual(set(sc.new_values), {"a", "e", "f o", "o"}) self.assertEqual(set(shared), set(sc.new_values)) @@ -47,8 +57,17 @@ def test_split_column(self): np.testing.assert_equal(shared["f o"], [0, 1]) np.testing.assert_equal(shared["o"], [2]) + def test_split_column_counts(self): + sc = SplitColumnCounts(self.data, self.data.domain.metas[0], ",") + shared = sc(self.data) + self.assertEqual(set(sc.new_values), {"a", "bbb", "d"}) + self.assertEqual(set(shared), set(sc.new_values)) + np.testing.assert_equal(shared["a"], [3, 0, 0]) + np.testing.assert_equal(shared["bbb"], [1, 0, 2]) + np.testing.assert_equal(shared["d"], [1, 0, 1]) + def test_no_known_values(self): - sc = SplitColumn(self.data, self.data.domain.metas[0], ",") + sc = SplitColumnOneHot(self.data, self.data.domain.metas[0], ",") data = Table.from_numpy( self.data.domain, np.zeros((3, 1)), None, np.array([["x"] * 2] * 3)) @@ -58,9 +77,10 @@ def test_no_known_values(self): oh = OneHotStrings(sc, attr) np.testing.assert_equal(oh(data), [0, 0, 0]) +class TestStringEncoding(TestComputation): def test_one_hot_strings(self): attr = self.data.domain.metas[0] - sc = SplitColumn(self.data, attr, ",") + sc = SplitColumnOneHot(self.data, attr, ",") oh = OneHotStrings(sc, "a") np.testing.assert_equal(oh(self.data), [1, 0, 0]) @@ -74,37 +94,64 @@ def test_one_hot_strings(self): np.array(["bbb,x,y", "", "bbb", "bbb,a", "foo"])[:, None]) np.testing.assert_equal(oh(data), [1, 0, 1, 1, 0]) + def test_count_strings(self): + attr = self.data.domain.metas[0] + sc = SplitColumnCounts(self.data, attr, ",") + + oh = CountStrings(sc, "a") + np.testing.assert_equal(oh(self.data), [3, 0, 0]) + + oh = CountStrings(sc, "bbb") + np.testing.assert_equal(oh(self.data), [1, 0, 2]) + + oh = CountStrings(sc, "d") + np.testing.assert_equal(oh(self.data), [1, 0, 1]) + + +class TestDiscreteEncoding(TestComputation): def test_one_hot_discrete(self): attr = self.data.domain.attributes[0] - oh = OneHotDiscrete(attr, " ", "a") + oh = DiscreteEncoding(attr, " ", True, "a") np.testing.assert_equal(oh(self.data), [0, 1, np.nan]) - oh = OneHotDiscrete(attr, " ", "d") + oh = DiscreteEncoding(attr, " ", True, "d") np.testing.assert_equal(oh(self.data), [1, 1, np.nan]) data = Table.from_numpy( Domain([attr], None), np.array([1, 0, 1, 0, np.nan])[:, None]) - oh = OneHotDiscrete(attr, " ", "a") + oh = DiscreteEncoding(attr, " ", True, "a") np.testing.assert_equal(oh(data), [0, 1, 0, 1, np.nan]) - oh = OneHotDiscrete(attr, " ", "d") + oh = DiscreteEncoding(attr, " ", True, "d") np.testing.assert_equal(oh(data), [1, 1, 1, 1, np.nan]) + def test_discrete_counts(self): + attr = self.data.domain.attributes[0] + + oh = DiscreteEncoding(attr, " ", False, "a") + np.testing.assert_equal(oh(self.data), [0, 1, np.nan]) + oh = DiscreteEncoding(attr, " ", False, "bb") + np.testing.assert_equal(oh(self.data), [1, 3, np.nan]) + with self.data.unlocked(): + self.data.X[2, 0] = 0 + np.testing.assert_equal(oh(self.data), [1, 3, 3]) + def test_discrete_metas(self): attr = DiscreteVariable("x", values=("a c d", "bb d")) domain = Domain([], None, [attr]) data = Table.from_numpy(domain, np.zeros((3, 0)), None, np.array([1, 0, np.nan])[:, None]) - oh = OneHotDiscrete(attr, " ", "a") + oh = DiscreteEncoding(attr, " ", True, "a") np.testing.assert_equal(oh(data), [0, 1, np.nan]) -class TestOWTextToColumns(WidgetTest): + +class TestOWSplit(WidgetTest): def setUp(self): - self.widget = self.create_widget(OWTextToColumns) + self.widget = self.create_widget(OWSplit) test_path = os.path.dirname(os.path.abspath(__file__)) self.data = Table.from_file(os.path.join(test_path, "orange-in-education.tab")) self._create_simple_corpus() @@ -124,8 +171,8 @@ def _create_simple_corpus(self) -> None: metas = np.array( [ ["foo,"], - ["bar,baz "], - ["foo,bar"], + ["bar,baz , bar, bar"], + ["foo,bar, foo"], [""], ] ) @@ -198,22 +245,83 @@ def test_output_string(self): [0, 0, 0]]) def test_output_discrete(self): - self.widget.delimiter = " " - attr = DiscreteVariable("x", values=("bar foo", "bar baz", "crux")) + w = self.widget + w.delimiter = " " + w.output_type = w.Categorical + + attr = DiscreteVariable( + "x", + values=("bar foo bar bar foo foo foo", "bar baz", "crux crux")) data = Table.from_numpy( Domain([attr], None), np.array([1, 1, 0, 1, 2, np.nan])[:, None], None) - self.send_signal(self.widget.Inputs.data, data) - out = self.get_output(self.widget.Outputs.data) + + counts = np.array([[1, 1, 0, 0], + [1, 1, 0, 0], + [3, 0, 0, 4], + [1, 1, 0, 0], + [0, 0, 2, 0], + [np.nan, np.nan, np.nan, np.nan]]) + exp_hot = np.hstack((data.X, np.vstack((counts[:-1] > 0, [[np.nan] * 4])))) + + self.send_signal(w.Inputs.data, data) + out = self.get_output(w.Outputs.data) self.assertEqual([attr.name for attr in out.domain.attributes], ["x", "bar", "baz", "crux", "foo"]) - np.testing.assert_equal(out.X, - [[1, 1, 1, 0, 0], - [1, 1, 1, 0, 0], - [0, 1, 0, 0, 1], - [1, 1, 1, 0, 0], - [2, 0, 0, 1, 0], - [np.nan, np.nan, np.nan, np.nan, np.nan]]) + for attr in out.domain.attributes[1:]: + self.assertTrue(attr.is_discrete) + self.assertEqual(attr.values, ("No", "Yes")) + np.testing.assert_equal(out.X, exp_hot) + + w.controls.output_type.buttons[w.Numerical].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["x", "bar", "baz", "crux", "foo"]) + for attr in out.domain.attributes[1:]: + self.assertTrue(attr.is_continuous) + np.testing.assert_equal(out.X, exp_hot) + + w.controls.output_type.buttons[w.Counts].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["x", "bar", "baz", "crux", "foo"]) + for attr in out.domain.attributes[1:]: + self.assertTrue(attr.is_continuous) + np.testing.assert_equal( + out.X, + np.hstack((data.X, np.vstack((counts[:-1], [[np.nan] * 4]))))) + + def test_output_types_string(self): + w = self.widget + w.delimiter = "," + w.output_type = w.Categorical + + self.send_signal(w.Inputs.data, self.small_table) + counts = np.array([[0, 0, 1], [3, 1, 0], [1, 0, 2], [0, 0, 0]]) + + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["bar", "baz", "foo (1)"]) + for attr in out.domain.attributes: + self.assertTrue(attr.is_discrete) + self.assertEqual(attr.values, ("No", "Yes")) + np.testing.assert_equal(out.X, counts > 0) + + w.controls.output_type.buttons[w.Numerical].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["bar", "baz", "foo (1)"]) + for attr in out.domain.attributes: + self.assertTrue(attr.is_continuous) + np.testing.assert_equal(out.X, counts > 0) + + w.controls.output_type.buttons[w.Counts].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["bar", "baz", "foo (1)"]) + for attr in out.domain.attributes: + self.assertTrue(attr.is_continuous) + np.testing.assert_equal(out.X, counts) if __name__ == "__main__":