From a0a60f25b9d646c62a5b85d749e4de16a8c75fb9 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 15 Aug 2025 16:06:49 +0200 Subject: [PATCH 1/3] Text To Columns: Allow setting output format --- .../prototypes/widgets/owtexttocolumns.py | 136 +++++++++---- .../widgets/tests/test_owtexttocolumns.py | 182 +++++++++++++++--- 2 files changed, 254 insertions(+), 64 deletions(-) diff --git a/orangecontrib/prototypes/widgets/owtexttocolumns.py b/orangecontrib/prototypes/widgets/owtexttocolumns.py index 3507171b..a8b239fb 100644 --- a/orangecontrib/prototypes/widgets/owtexttocolumns.py +++ b/orangecontrib/prototypes/widgets/owtexttocolumns.py @@ -9,7 +9,8 @@ from Orange.widgets.widget import OWWidget, Msg, Output, Input from Orange.widgets.utils.itemmodels import DomainModel from Orange.widgets.utils.widgetpreview import WidgetPreview -from Orange.data import Table, Domain, DiscreteVariable, StringVariable +from Orange.data import \ + Table, Domain, DiscreteVariable, StringVariable, ContinuousVariable from Orange.data.util import SharedComputeValue, get_unique_names from orangewidget.settings import Setting @@ -20,13 +21,25 @@ def get_substrings(values, delimiter): - {""}) -class SplitColumn: +class SplitColumnBase: def __init__(self, data, attr, delimiter): self.attr = attr self.delimiter = delimiter column = set(data.get_column(self.attr)) self.new_values = tuple(get_substrings(column, self.delimiter)) + def __eq__(self, other): + return self.attr == other.attr \ + and self.delimiter == other.delimiter \ + and self.new_values == other.new_values + + def __hash__(self): + return hash((self.attr, self.delimiter, self.new_values)) + + +class SplitColumnOneHot(SplitColumnBase): + InheritEq = True + def __call__(self, data): column = data.get_column(self.attr) values = [{ss.strip() for ss in s.split(self.delimiter)} @@ -35,55 +48,74 @@ def __call__(self, data): dtype=int) for v in self.new_values} - def __eq__(self, other): - return self.attr == other.attr \ - and self.delimiter == other.delimiter \ - and self.new_values == other.new_values - def __hash__(self): - return hash((self.attr, self.delimiter, self.new_values)) +class SplitColumnCounts(SplitColumnBase): + InheritEq = True + + def __call__(self, data): + column = data.get_column(self.attr) + values = [[ss.strip() for ss in s.split(self.delimiter)] + for s in column] + return {v: np.array([xs.count(v) for xs in values], dtype=float) + for v in self.new_values} -class OneHotStrings(SharedComputeValue): +class StringEncodingBase(SharedComputeValue): def __init__(self, fn, new_feature): super().__init__(fn) self.new_feature = new_feature + def __eq__(self, other): + return super().__eq__(other) and self.new_feature == other.new_feature + + def __hash__(self): + return super().__hash__() ^ hash(self.new_feature) + + +class OneHotStrings(StringEncodingBase): + InheritEq = True + def compute(self, data, shared_data): indices = shared_data[self.new_feature] col = np.zeros(len(data)) col[indices] = 1 return col - def __eq__(self, other): - return super().__eq__(other) and self.new_feature == other.new_feature - def __hash__(self): - return super().__hash__() ^ hash(self.new_feature) +class CountStrings(StringEncodingBase): + InheritEq = True + + def compute(self, data, shared_data): + return shared_data[self.new_feature] -class OneHotDiscrete: - def __init__(self, variable, delimiter, value): +class DiscreteEncoding: + def __init__(self, variable, delimiter, onehot, value): self.variable = variable - self.value = value self.delimiter = delimiter + self.onehot = onehot + self.value = value def __call__(self, data): column = data.get_column(self.variable).astype(float) col = np.zeros(len(column)) col[np.isnan(column)] = np.nan for val_idx, value in enumerate(self.variable.values): - if self.value in value.split(self.delimiter): - col[column == val_idx] = 1 + parts = value.split(self.delimiter) + if self.onehot: + col[column == val_idx] = int(self.value in parts) + else: + col[column == val_idx] = parts.count(self.value) return col def __eq__(self, other): return self.variable == other.variable \ and self.value == other.value \ - and self.delimiter == other.delimiter + and self.delimiter == other.delimiter \ + and self.onehot == other.onehot def __hash__(self): - return hash((self.variable, self.value, self.delimiter)) + return hash((self.variable, self.value, self.delimiter, self.onehot)) class OWTextToColumns(OWWidget): @@ -106,9 +138,18 @@ class Warning(OWWidget.Warning): want_main_area = False resizing_enabled = False + NoYes, Categorical01, Numerical01, Counts = range(4) + OutputLabels = ( + "No / Yes", + "0 / 1 (as categorical)", + "0 / 1 (as numbers)", + "Counts" + ) + settingsHandler = DomainContextHandler() attribute = ContextSetting(None) delimiter = ContextSetting(";") + output_type = ContextSetting(NoYes) auto_apply = Setting(True) def __init__(self): @@ -123,8 +164,14 @@ def __init__(self): model=DomainModel(valid_types=(StringVariable, DiscreteVariable))) gui.lineEdit( - variable_select_box, self, "delimiter", - orientation=Qt.Horizontal, callback=self.apply.deferred) + variable_select_box, self, "delimiter", "Delimiter: ", + orientation=Qt.Horizontal, callback=self.apply.deferred, + controlWidth=20).box.layout().addStretch(1) + + gui.radioButtonsInBox( + self.controlArea, self, "output_type", self.OutputLabels, + box="Output", + callback=self.apply.deferred) gui.auto_apply(self.buttonsArea, self, commit=self.apply) @@ -150,20 +197,8 @@ def apply(self): self.Outputs.data.send(None) return var = self.data.domain[self.attribute] - - if var.is_discrete: - values = get_substrings(var.values, self.delimiter) - computer = partial(OneHotDiscrete, var, self.delimiter) - else: - sc = SplitColumn(self.data, var, self.delimiter) - values = sc.new_values - computer = partial(OneHotStrings, sc) - names = get_unique_names(self.data.domain, values, equal_numbers=False) - - new_columns = tuple(DiscreteVariable( - name, values=("0", "1"), compute_value=computer(value) - ) for value, name in zip(values, names)) - + values, computer = self._get_compute_value(var) + new_columns = self._get_new_columns(values, computer) new_domain = Domain( self.data.domain.attributes + new_columns, self.data.domain.class_vars, self.data.domain.metas @@ -171,6 +206,35 @@ def apply(self): extended_data = self.data.transform(new_domain) self.Outputs.data.send(extended_data) + def _get_compute_value(self, var): + if var.is_discrete: + values = get_substrings(var.values, self.delimiter) + computer = partial( + DiscreteEncoding, + var, self.delimiter, self.output_type != self.Counts) + else: + if self.output_type == self.Counts: + sc = SplitColumnCounts(self.data, var, self.delimiter) + computer = partial(CountStrings, sc) + else: + sc = SplitColumnOneHot(self.data, var, self.delimiter) + computer = partial(OneHotStrings, sc) + values = sc.new_values + return values, computer + + def _get_new_columns(self, values, computer): + names = get_unique_names(self.data.domain, values, equal_numbers=False) + if self.output_type in (self.Numerical01, self.Counts): + return tuple( + ContinuousVariable(name, compute_value=computer(value)) + for value, name in zip(values, names)) + else: + varvalues = ("0", "1") if self.output_type == self.Categorical01 \ + else ("No", "Yes") + return tuple(DiscreteVariable( + name, varvalues, compute_value=computer(value) + ) for value, name in zip(values, names)) + if __name__ == "__main__": # pragma: no cover WidgetPreview(OWTextToColumns).run(Table.from_file( diff --git a/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py b/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py index c685f090..80cb8d8f 100644 --- a/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py +++ b/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py @@ -8,19 +8,29 @@ from Orange.widgets.tests.base import WidgetTest from orangecontrib.prototypes.widgets.owtexttocolumns import \ - OWTextToColumns, SplitColumn, get_substrings, OneHotStrings, OneHotDiscrete + OWTextToColumns, SplitColumnOneHot, get_substrings, OneHotStrings, \ + DiscreteEncoding, SplitColumnCounts, CountStrings class TestComputation(unittest.TestCase): def setUp(self): - domain = Domain([DiscreteVariable("x", values=("a c d", "bb d"))], None, - [StringVariable("foo"), StringVariable("bar")]) + domain = Domain( + [ + DiscreteVariable("x", values=("a c d c bb bb bb", "bb d")) + ], + None, + [ + StringVariable("foo"), + StringVariable("bar") + ]) self.data = Table.from_numpy( domain, - np.array([1, 0, np.nan])[:, None], None, - [["a,bbb,d", "e;f o"], ["", "f o"], ["bbb,d", "e;a;o"]] + np.array([[1], [0], [np.nan]]), None, + [["a,bbb,d,a,a", "e;f o"], ["", "f o"], ["bbb,d,bbb", "e;a;o"]] ) + +class TestSplitColumn(TestComputation): def test_get_string_values(self): np.testing.assert_equal( set(get_substrings({"a bc", "d,e", "", "f,a t", "t"}, " ")), @@ -29,8 +39,8 @@ def test_get_string_values(self): set(get_substrings({"a bc", "d,e", "", "f,a t", "t"}, ",")), {"a bc", "d", "e", "f", "a t", "t"}) - def test_split_column(self): - sc = SplitColumn(self.data, self.data.domain.metas[0], ",") + def test_split_column_one_hot(self): + sc = SplitColumnOneHot(self.data, self.data.domain.metas[0], ",") shared = sc(self.data) self.assertEqual(set(sc.new_values), {"a", "bbb", "d"}) self.assertEqual(set(shared), set(sc.new_values)) @@ -38,7 +48,7 @@ def test_split_column(self): np.testing.assert_equal(shared["bbb"], [0, 2]) np.testing.assert_equal(shared["d"], [0, 2]) - sc = SplitColumn(self.data, self.data.domain.metas[1], ";") + sc = SplitColumnOneHot(self.data, self.data.domain.metas[1], ";") shared = sc(self.data) self.assertEqual(set(sc.new_values), {"a", "e", "f o", "o"}) self.assertEqual(set(shared), set(sc.new_values)) @@ -47,8 +57,17 @@ def test_split_column(self): np.testing.assert_equal(shared["f o"], [0, 1]) np.testing.assert_equal(shared["o"], [2]) + def test_split_column_counts(self): + sc = SplitColumnCounts(self.data, self.data.domain.metas[0], ",") + shared = sc(self.data) + self.assertEqual(set(sc.new_values), {"a", "bbb", "d"}) + self.assertEqual(set(shared), set(sc.new_values)) + np.testing.assert_equal(shared["a"], [3, 0, 0]) + np.testing.assert_equal(shared["bbb"], [1, 0, 2]) + np.testing.assert_equal(shared["d"], [1, 0, 1]) + def test_no_known_values(self): - sc = SplitColumn(self.data, self.data.domain.metas[0], ",") + sc = SplitColumnOneHot(self.data, self.data.domain.metas[0], ",") data = Table.from_numpy( self.data.domain, np.zeros((3, 1)), None, np.array([["x"] * 2] * 3)) @@ -58,9 +77,10 @@ def test_no_known_values(self): oh = OneHotStrings(sc, attr) np.testing.assert_equal(oh(data), [0, 0, 0]) +class TestStringEncoding(TestComputation): def test_one_hot_strings(self): attr = self.data.domain.metas[0] - sc = SplitColumn(self.data, attr, ",") + sc = SplitColumnOneHot(self.data, attr, ",") oh = OneHotStrings(sc, "a") np.testing.assert_equal(oh(self.data), [1, 0, 0]) @@ -74,34 +94,61 @@ def test_one_hot_strings(self): np.array(["bbb,x,y", "", "bbb", "bbb,a", "foo"])[:, None]) np.testing.assert_equal(oh(data), [1, 0, 1, 1, 0]) + def test_count_strings(self): + attr = self.data.domain.metas[0] + sc = SplitColumnCounts(self.data, attr, ",") + + oh = CountStrings(sc, "a") + np.testing.assert_equal(oh(self.data), [3, 0, 0]) + + oh = CountStrings(sc, "bbb") + np.testing.assert_equal(oh(self.data), [1, 0, 2]) + + oh = CountStrings(sc, "d") + np.testing.assert_equal(oh(self.data), [1, 0, 1]) + + +class TestDiscreteEncoding(TestComputation): def test_one_hot_discrete(self): attr = self.data.domain.attributes[0] - oh = OneHotDiscrete(attr, " ", "a") + oh = DiscreteEncoding(attr, " ", True, "a") np.testing.assert_equal(oh(self.data), [0, 1, np.nan]) - oh = OneHotDiscrete(attr, " ", "d") + oh = DiscreteEncoding(attr, " ", True, "d") np.testing.assert_equal(oh(self.data), [1, 1, np.nan]) data = Table.from_numpy( Domain([attr], None), np.array([1, 0, 1, 0, np.nan])[:, None]) - oh = OneHotDiscrete(attr, " ", "a") + oh = DiscreteEncoding(attr, " ", True, "a") np.testing.assert_equal(oh(data), [0, 1, 0, 1, np.nan]) - oh = OneHotDiscrete(attr, " ", "d") + oh = DiscreteEncoding(attr, " ", True, "d") np.testing.assert_equal(oh(data), [1, 1, 1, 1, np.nan]) + def test_discrete_counts(self): + attr = self.data.domain.attributes[0] + + oh = DiscreteEncoding(attr, " ", False, "a") + np.testing.assert_equal(oh(self.data), [0, 1, np.nan]) + oh = DiscreteEncoding(attr, " ", False, "bb") + np.testing.assert_equal(oh(self.data), [1, 3, np.nan]) + with self.data.unlocked(): + self.data.X[2, 0] = 0 + np.testing.assert_equal(oh(self.data), [1, 3, 3]) + def test_discrete_metas(self): attr = DiscreteVariable("x", values=("a c d", "bb d")) domain = Domain([], None, [attr]) data = Table.from_numpy(domain, np.zeros((3, 0)), None, np.array([1, 0, np.nan])[:, None]) - oh = OneHotDiscrete(attr, " ", "a") + oh = DiscreteEncoding(attr, " ", True, "a") np.testing.assert_equal(oh(data), [0, 1, np.nan]) + class TestOWTextToColumns(WidgetTest): def setUp(self): self.widget = self.create_widget(OWTextToColumns) @@ -124,8 +171,8 @@ def _create_simple_corpus(self) -> None: metas = np.array( [ ["foo,"], - ["bar,baz "], - ["foo,bar"], + ["bar,baz , bar, bar"], + ["foo,bar, foo"], [""], ] ) @@ -198,22 +245,101 @@ def test_output_string(self): [0, 0, 0]]) def test_output_discrete(self): - self.widget.delimiter = " " - attr = DiscreteVariable("x", values=("bar foo", "bar baz", "crux")) + w = self.widget + w.delimiter = " " + w.output_type = w.Categorical01 + + attr = DiscreteVariable( + "x", + values=("bar foo bar bar foo foo foo", "bar baz", "crux crux")) data = Table.from_numpy( Domain([attr], None), np.array([1, 1, 0, 1, 2, np.nan])[:, None], None) - self.send_signal(self.widget.Inputs.data, data) - out = self.get_output(self.widget.Outputs.data) + + counts = np.array([[1, 1, 0, 0], + [1, 1, 0, 0], + [3, 0, 0, 4], + [1, 1, 0, 0], + [0, 0, 2, 0], + [np.nan, np.nan, np.nan, np.nan]]) + exp_hot = np.hstack((data.X, np.vstack((counts[:-1] > 0, [[np.nan] * 4])))) + + self.send_signal(w.Inputs.data, data) + out = self.get_output(w.Outputs.data) self.assertEqual([attr.name for attr in out.domain.attributes], ["x", "bar", "baz", "crux", "foo"]) - np.testing.assert_equal(out.X, - [[1, 1, 1, 0, 0], - [1, 1, 1, 0, 0], - [0, 1, 0, 0, 1], - [1, 1, 1, 0, 0], - [2, 0, 0, 1, 0], - [np.nan, np.nan, np.nan, np.nan, np.nan]]) + for attr in out.domain.attributes[1:]: + self.assertTrue(attr.is_discrete) + self.assertEqual(attr.values, ("0", "1")) + np.testing.assert_equal(out.X, exp_hot) + + w.controls.output_type.buttons[w.NoYes].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["x", "bar", "baz", "crux", "foo"]) + for attr in out.domain.attributes[1:]: + self.assertTrue(attr.is_discrete) + self.assertEqual(attr.values, ("No", "Yes")) + np.testing.assert_equal(out.X, exp_hot) + + w.controls.output_type.buttons[w.Numerical01].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["x", "bar", "baz", "crux", "foo"]) + for attr in out.domain.attributes[1:]: + self.assertTrue(attr.is_continuous) + np.testing.assert_equal(out.X, exp_hot) + + w.controls.output_type.buttons[w.Counts].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["x", "bar", "baz", "crux", "foo"]) + for attr in out.domain.attributes[1:]: + self.assertTrue(attr.is_continuous) + np.testing.assert_equal( + out.X, + np.hstack((data.X, np.vstack((counts[:-1], [[np.nan] * 4]))))) + + def test_output_types_string(self): + w = self.widget + w.delimiter = "," + w.output_type = w.Categorical01 + + self.send_signal(w.Inputs.data, self.small_table) + counts = np.array([[0, 0, 1], [3, 1, 0], [1, 0, 2], [0, 0, 0]]) + + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["bar", "baz", "foo (1)"]) + for attr in out.domain.attributes: + self.assertTrue(attr.is_discrete) + self.assertEqual(attr.values, ("0", "1")) + np.testing.assert_equal(out.X, counts > 0) + + w.controls.output_type.buttons[w.NoYes].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["bar", "baz", "foo (1)"]) + for attr in out.domain.attributes: + self.assertTrue(attr.is_discrete) + self.assertEqual(attr.values, ("No", "Yes")) + np.testing.assert_equal(out.X, counts > 0) + + w.controls.output_type.buttons[w.Numerical01].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["bar", "baz", "foo (1)"]) + for attr in out.domain.attributes: + self.assertTrue(attr.is_continuous) + np.testing.assert_equal(out.X, counts > 0) + + w.controls.output_type.buttons[w.Counts].click() + out = self.get_output(w.Outputs.data) + self.assertEqual([attr.name for attr in out.domain.attributes], + ["bar", "baz", "foo (1)"]) + for attr in out.domain.attributes: + self.assertTrue(attr.is_continuous) + np.testing.assert_equal(out.X, counts) if __name__ == "__main__": From 2c976c5d2ccc999c0f42b46c9d468a762dc0e842 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 22 Aug 2025 23:01:00 +0200 Subject: [PATCH 2/3] Text to Columns: Remove Categorical01 --- .../prototypes/widgets/owtexttocolumns.py | 27 ++++++++----------- .../widgets/tests/test_owtexttocolumns.py | 26 +++--------------- 2 files changed, 15 insertions(+), 38 deletions(-) diff --git a/orangecontrib/prototypes/widgets/owtexttocolumns.py b/orangecontrib/prototypes/widgets/owtexttocolumns.py index a8b239fb..46f9a582 100644 --- a/orangecontrib/prototypes/widgets/owtexttocolumns.py +++ b/orangecontrib/prototypes/widgets/owtexttocolumns.py @@ -138,18 +138,13 @@ class Warning(OWWidget.Warning): want_main_area = False resizing_enabled = False - NoYes, Categorical01, Numerical01, Counts = range(4) - OutputLabels = ( - "No / Yes", - "0 / 1 (as categorical)", - "0 / 1 (as numbers)", - "Counts" - ) + Categorical, Numerical, Counts = range(3) + OutputLabels = ("Categorical (No, Yes)", "Numerical (0, 1)", "Counts") settingsHandler = DomainContextHandler() attribute = ContextSetting(None) delimiter = ContextSetting(";") - output_type = ContextSetting(NoYes) + output_type = ContextSetting(Categorical) auto_apply = Setting(True) def __init__(self): @@ -170,7 +165,7 @@ def __init__(self): gui.radioButtonsInBox( self.controlArea, self, "output_type", self.OutputLabels, - box="Output", + box="Output Values", callback=self.apply.deferred) gui.auto_apply(self.buttonsArea, self, commit=self.apply) @@ -224,16 +219,16 @@ def _get_compute_value(self, var): def _get_new_columns(self, values, computer): names = get_unique_names(self.data.domain, values, equal_numbers=False) - if self.output_type in (self.Numerical01, self.Counts): + if self.output_type == self.Categorical: return tuple( - ContinuousVariable(name, compute_value=computer(value)) + DiscreteVariable( + name, ("No", "Yes"), compute_value=computer(value)) for value, name in zip(values, names)) else: - varvalues = ("0", "1") if self.output_type == self.Categorical01 \ - else ("No", "Yes") - return tuple(DiscreteVariable( - name, varvalues, compute_value=computer(value) - ) for value, name in zip(values, names)) + return tuple( + ContinuousVariable( + name, compute_value=computer(value)) + for value, name in zip(values, names)) if __name__ == "__main__": # pragma: no cover diff --git a/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py b/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py index 80cb8d8f..aded9c5c 100644 --- a/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py +++ b/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py @@ -247,7 +247,7 @@ def test_output_string(self): def test_output_discrete(self): w = self.widget w.delimiter = " " - w.output_type = w.Categorical01 + w.output_type = w.Categorical attr = DiscreteVariable( "x", @@ -266,15 +266,6 @@ def test_output_discrete(self): self.send_signal(w.Inputs.data, data) out = self.get_output(w.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["x", "bar", "baz", "crux", "foo"]) - for attr in out.domain.attributes[1:]: - self.assertTrue(attr.is_discrete) - self.assertEqual(attr.values, ("0", "1")) - np.testing.assert_equal(out.X, exp_hot) - - w.controls.output_type.buttons[w.NoYes].click() - out = self.get_output(w.Outputs.data) self.assertEqual([attr.name for attr in out.domain.attributes], ["x", "bar", "baz", "crux", "foo"]) for attr in out.domain.attributes[1:]: @@ -282,7 +273,7 @@ def test_output_discrete(self): self.assertEqual(attr.values, ("No", "Yes")) np.testing.assert_equal(out.X, exp_hot) - w.controls.output_type.buttons[w.Numerical01].click() + w.controls.output_type.buttons[w.Numerical].click() out = self.get_output(w.Outputs.data) self.assertEqual([attr.name for attr in out.domain.attributes], ["x", "bar", "baz", "crux", "foo"]) @@ -303,20 +294,11 @@ def test_output_discrete(self): def test_output_types_string(self): w = self.widget w.delimiter = "," - w.output_type = w.Categorical01 + w.output_type = w.Categorical self.send_signal(w.Inputs.data, self.small_table) counts = np.array([[0, 0, 1], [3, 1, 0], [1, 0, 2], [0, 0, 0]]) - out = self.get_output(w.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["bar", "baz", "foo (1)"]) - for attr in out.domain.attributes: - self.assertTrue(attr.is_discrete) - self.assertEqual(attr.values, ("0", "1")) - np.testing.assert_equal(out.X, counts > 0) - - w.controls.output_type.buttons[w.NoYes].click() out = self.get_output(w.Outputs.data) self.assertEqual([attr.name for attr in out.domain.attributes], ["bar", "baz", "foo (1)"]) @@ -325,7 +307,7 @@ def test_output_types_string(self): self.assertEqual(attr.values, ("No", "Yes")) np.testing.assert_equal(out.X, counts > 0) - w.controls.output_type.buttons[w.Numerical01].click() + w.controls.output_type.buttons[w.Numerical].click() out = self.get_output(w.Outputs.data) self.assertEqual([attr.name for attr in out.domain.attributes], ["bar", "baz", "foo (1)"]) From 3e7fb793e22cf01bfeacf0478e3cec4fd2ac9993 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 22 Aug 2025 23:06:35 +0200 Subject: [PATCH 3/3] Text to Columns: Rename to Split --- .../widgets/icons/{TextToColumns.svg => Split.svg} | 0 .../widgets/{owtexttocolumns.py => owsplit.py} | 13 +++++++------ .../{test_owtexttocolumns.py => test_owsplit.py} | 8 ++++---- 3 files changed, 11 insertions(+), 10 deletions(-) rename orangecontrib/prototypes/widgets/icons/{TextToColumns.svg => Split.svg} (100%) rename orangecontrib/prototypes/widgets/{owtexttocolumns.py => owsplit.py} (95%) rename orangecontrib/prototypes/widgets/tests/{test_owtexttocolumns.py => test_owsplit.py} (98%) diff --git a/orangecontrib/prototypes/widgets/icons/TextToColumns.svg b/orangecontrib/prototypes/widgets/icons/Split.svg similarity index 100% rename from orangecontrib/prototypes/widgets/icons/TextToColumns.svg rename to orangecontrib/prototypes/widgets/icons/Split.svg diff --git a/orangecontrib/prototypes/widgets/owtexttocolumns.py b/orangecontrib/prototypes/widgets/owsplit.py similarity index 95% rename from orangecontrib/prototypes/widgets/owtexttocolumns.py rename to orangecontrib/prototypes/widgets/owsplit.py index 46f9a582..05fe042a 100644 --- a/orangecontrib/prototypes/widgets/owtexttocolumns.py +++ b/orangecontrib/prototypes/widgets/owsplit.py @@ -118,11 +118,13 @@ def __hash__(self): return hash((self.variable, self.value, self.delimiter, self.onehot)) -class OWTextToColumns(OWWidget): - name = "Text to Columns" +class OWSplit(OWWidget): + name = "Split" description = "Split text or categorical variables into binary indicators" - icon = "icons/TextToColumns.svg" - keywords = ["split"] + icon = "icons/Split.svg" + keywords = ["text to columns", "word encoding", "questionnaire", "survey", + "term", "word presence", "word counts", "categorical encoding", + "indicator variables"] priority = 700 replaces = ["orangecontrib.prototypes.widgets.owsplit.OWSplit"] @@ -232,5 +234,4 @@ def _get_new_columns(self, values, computer): if __name__ == "__main__": # pragma: no cover - WidgetPreview(OWTextToColumns).run(Table.from_file( - "tests/orange-in-education.tab")) + WidgetPreview(OWSplit).run(Table.from_file("tests/orange-in-education.tab")) diff --git a/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py b/orangecontrib/prototypes/widgets/tests/test_owsplit.py similarity index 98% rename from orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py rename to orangecontrib/prototypes/widgets/tests/test_owsplit.py index aded9c5c..34aa6f87 100644 --- a/orangecontrib/prototypes/widgets/tests/test_owtexttocolumns.py +++ b/orangecontrib/prototypes/widgets/tests/test_owsplit.py @@ -7,8 +7,8 @@ from Orange.data import Table, StringVariable, Domain, DiscreteVariable from Orange.widgets.tests.base import WidgetTest -from orangecontrib.prototypes.widgets.owtexttocolumns import \ - OWTextToColumns, SplitColumnOneHot, get_substrings, OneHotStrings, \ +from orangecontrib.prototypes.widgets.owsplit import \ + OWSplit, SplitColumnOneHot, get_substrings, OneHotStrings, \ DiscreteEncoding, SplitColumnCounts, CountStrings @@ -149,9 +149,9 @@ def test_discrete_metas(self): -class TestOWTextToColumns(WidgetTest): +class TestOWSplit(WidgetTest): def setUp(self): - self.widget = self.create_widget(OWTextToColumns) + self.widget = self.create_widget(OWSplit) test_path = os.path.dirname(os.path.abspath(__file__)) self.data = Table.from_file(os.path.join(test_path, "orange-in-education.tab")) self._create_simple_corpus()