diff --git a/src/ilp_keyboard_layout_optimization/data_aquisition/chars.py b/src/ilp_keyboard_layout_optimization/data_aquisition/chars.py index 7cc435beb49630e9f53f4e0e18fd264049581dd7..cb16d6ab3331a8a541feba62ac2b6e7cd5bb140e 100644 --- a/src/ilp_keyboard_layout_optimization/data_aquisition/chars.py +++ b/src/ilp_keyboard_layout_optimization/data_aquisition/chars.py @@ -6,7 +6,7 @@ import string from itertools import product from typing import Optional, Tuple, Union -from ..type_aliases import Bigram, CharTuple +from ..type_aliases import Bigram, Char, CharTuple class Chars: @@ -76,3 +76,38 @@ class Chars: "".join(bigram_tuple) for bigram_tuple in product(self.chars, repeat=2) ) return self._bis + + def remove(self, char: Char): + self.chars = self.chars.replace(char, "") + + @staticmethod + def is_bigram(chars: str) -> bool: + """Check a character string for length two + + Parameters + ---------- + chars : str + string to check for its length + + Returns + ------- + bool + True, if provided string is of length two, False otherwise + """ + return len(chars) == 2 + + @staticmethod + def is_monogram(char: str) -> bool: + """Check a character string for length one + + Parameters + ---------- + char : str + string to check for its length + + Returns + ------- + bool + True, if provided string is of length one, False otherwise + """ + return len(char) == 1 diff --git a/src/ilp_keyboard_layout_optimization/data_aquisition/probabilities.py b/src/ilp_keyboard_layout_optimization/data_aquisition/probabilities.py new file mode 100644 index 0000000000000000000000000000000000000000..a699fd2dd3d24b366e7c16647ccfdf0dfb0f9dca --- /dev/null +++ b/src/ilp_keyboard_layout_optimization/data_aquisition/probabilities.py @@ -0,0 +1,284 @@ +"""This module contains a class representing (special) character counts""" + +__all__ = ["CharProbs"] + +import csv +import os +from collections import defaultdict +from os.path import abspath, basename +from typing import List, Optional +from 
urllib.parse import urlparse +from urllib.request import urlopen + +from .chars import Chars +from ..type_aliases import Bigram, Char, ProbDict + + +class CharProbs: + """Instances represent all relevant (special) character probabilities + + Parameters + ---------- + chars : Chars + the set of characters to consider + mono_urls : str + download URLs for the monogram probabilities including their file name + bi_urls : str + download URLs for the bigram probabilities including their file name + """ + + MONO_URLS: List[str] = [ + "http://www.ids-mannheim.de/fileadmin/kl/derewo/" + "DeReChar-v-uni-204-a-c-2018-02-28-1.0.csv", + "https://hg.sr.ht/~arnebab/evolve-keyboard-layout/raw/" + "1-gramme.15.txt?rev=tip", + "https://hg.sr.ht/~arnebab/evolve-keyboard-layout/raw/" + "1-gramme.wiki.txt?rev=tip", + "https://hg.sr.ht/~arnebab/evolve-keyboard-layout/raw/1gramme.txt?rev=tip", + ] + + BI_URLS: List[str] = [ + "https://hg.sr.ht/~arnebab/evolve-keyboard-layout/raw/2-gramme.15.txt?rev=tip", + "https://hg.sr.ht/~arnebab/evolve-keyboard-layout/raw/" + "2-gramme.wiki.txt?rev=tip", + "https://hg.sr.ht/~arnebab/evolve-keyboard-layout/raw/2gramme.txt?rev=tip", + ] + + def __init__( + self, + chars: Chars = None, + mono_urls: List[str] = None, + bi_urls: List[str] = None, + ): + self.chars: Optional[str] = None if chars is None else chars.chars + self._no_char_restr = self.chars is None + self.mono_urls: List[str] = self.MONO_URLS if mono_urls is None else mono_urls + self.bi_urls: List[str] = self.BI_URLS if bi_urls is None else bi_urls + self.mono_filenames: List[str] = self.download_raw_to_files(self.mono_urls) + self.bi_filenames: List[str] = self.download_raw_to_files(self.bi_urls) + self._mono_probs = self._extract_monogram_data() + self._bi_probs = self._extract_bigram_data() + + @staticmethod + def download_raw_to_file(url: str, overwrite: bool = False) -> str: + """Download raw data from a URL + + Parameters + ---------- + url : str + download URL including file name + 
overwrite : bool + if True a present file gets replaced by the specified file, otherwise + (default) local file stays and does not get replaced + + Returns + ------- + str + file name of the raw data on disc extracted from the URL + """ + filename = abspath(basename(urlparse(url).path)) + if overwrite or not os.path.exists(filename): + with urlopen(url) as response, open(filename, "wb") as out_file: + downloaded_raw_data = response.read() + out_file.write(downloaded_raw_data) + return filename + + @staticmethod + def download_raw_to_files(urls: List[str]) -> List[str]: + """Download raw data from a URLs + + Parameters + ---------- + urls : list of str + download URLs including file names + + Returns + ------- + list of str + files name of the raw data on disc extracted from the URLs + """ + filenames = [] + for url in urls: + filenames.append(CharProbs.download_raw_to_file(url)) + return filenames + + def _extract_monogram_data(self) -> ProbDict: + mono_probs = defaultdict(float) + for mono_filename in self.mono_filenames: + new_mono_probs = defaultdict(float) + match basename(mono_filename): + case "DeReChar-v-uni-204-a-c-2018-02-28-1.0.csv": + new_mono_probs = self._extract_derechar_monos(mono_filename) + case "1-gramme.15.txt" | "1-gramme.wiki.txt" | "1gramme.txt": + new_mono_probs = self._extract_arnes_probs(mono_filename) + mono_probs = CharProbs._merge_probs(mono_probs, new_mono_probs) + return mono_probs + + def _extract_derechar_monos(self, filename: str) -> ProbDict: + chars_left_to_find = Chars(self.chars) + with open(filename, encoding="UTF-8") as mono_csv_file: + mono_csv_dialect: str = "derechar" + csv.register_dialect( + mono_csv_dialect, delimiter=" ", quoting=csv.QUOTE_NONE + ) + reader = csv.DictReader( + mono_csv_file, + fieldnames=[ + "probability", + "absolute_count", + "hexadecimal Unicode code point", + "decimal value corresponding to the code", + "unicode general category", + "glyph", + "name of the symbol", + "unicode block designation", + 
], + dialect=mono_csv_dialect, + ) + total_sum: float = 0.0 + mono_probs_per_hex_code: ProbDict = defaultdict(float) + for row in reader: + try: + probability = float(row["probability"]) + except ValueError: + continue + assert probability + if self._no_char_restr or row["glyph"] in self.chars: + mono_probs_per_hex_code[ + row["hexadecimal Unicode code point"] + ] = probability + assert mono_probs_per_hex_code[ + row["hexadecimal Unicode code point"] + ] == float(row["probability"]) + total_sum += probability + if chars_left_to_find is not None: + chars_left_to_find.remove(row["glyph"]) + assert total_sum == sum(mono_probs_per_hex_code.values()) + if chars_left_to_find is not None and not chars_left_to_find.chars: + break + if self._no_char_restr: + assert CharProbs._almost_equal_to_one(total_sum) + assert len(mono_probs_per_hex_code) == reader.line_num - 2 + del mono_probs_per_hex_code[""] + assert len(mono_probs_per_hex_code) == reader.line_num - 3 + else: + assert total_sum <= 1.0 + assert len(mono_probs_per_hex_code) == len(self.chars) + mono_probs_per_hex_code = CharProbs.strip_and_normalize_probs( + mono_probs_per_hex_code + ) + assert CharProbs._almost_equal_to_one(sum(mono_probs_per_hex_code.values())) + mono_probs: ProbDict = defaultdict(float) + for code, prob in mono_probs_per_hex_code.items(): + mono_probs[chr(int(code[2:], 16))] = prob + assert CharProbs._almost_equal_to_one(sum(mono_probs.values())) + return mono_probs + + @staticmethod + def _almost_equal_to_one(number: float) -> bool: + return round(number, 8) == 1.0 + + @staticmethod + def _merge_probs(probs_origin: ProbDict, probs_to_merge: ProbDict) -> ProbDict: + for char, prob in probs_to_merge.items(): + probs_origin[char] += prob + normed_and_stripped_probs = CharProbs.strip_and_normalize_probs(probs_origin) + assert not normed_and_stripped_probs or CharProbs._almost_equal_to_one( + sum(normed_and_stripped_probs.values()) + ) + return normed_and_stripped_probs + + @staticmethod + def 
strip_and_normalize_probs(probs: ProbDict) -> ProbDict: + """Strip from all zero elements and ensure values sum up to one + + Parameters + ---------- + probs : ProbDict + the dictionary of probabilities + + Returns + ------- + ProbDict + the stripped and normalized dict, in which the elements' values are + guaranteed to sum up to one and no zeros are left + """ + absolute_sum = sum(probs.values()) + stripped_and_normalized_probs = defaultdict(float) + for char, count in probs.items(): + if count != 0: + stripped_and_normalized_probs[char] = count / absolute_sum + return stripped_and_normalized_probs + + def _extract_arnes_probs(self, filename: str) -> ProbDict: + if CharProbs._currently_processing_bigrams(filename): + chars_left_to_find = set(Chars(self.chars).bis) + else: + chars_left_to_find = set(Chars(self.chars).monos) + with open(filename, encoding="utf-8-sig") as txt_file: + total_sum: int = 0 + probs: ProbDict = defaultdict(int) + for row in txt_file: + count_and_glyphs: List[str, str] = ( + row.lstrip().rstrip("\n").split(" ", 1) + ) + if CharProbs._should_be_bigram_but_is_not( + count_and_glyphs[1], filename + ) or CharProbs._is_replace_char(count_and_glyphs[1]): + continue + assert ( + Chars.is_bigram(count_and_glyphs[1]) + if CharProbs._currently_processing_bigrams(filename) + else Chars.is_monogram(count_and_glyphs[1]) + ) + assert isinstance(count_and_glyphs, list) + assert len(count_and_glyphs) == 2 + try: + count = int(count_and_glyphs[0]) + except ValueError: + continue + if self._no_char_restr or ( + count_and_glyphs[1][0] in self.chars + and count_and_glyphs[1][-1] in self.chars + ): + probs[count_and_glyphs[1]] = count + total_sum += count + assert probs[count_and_glyphs[1]] == int(count_and_glyphs[0]) + if chars_left_to_find is not None: + chars_left_to_find.discard(count_and_glyphs[1]) + assert total_sum == sum(probs.values()) + if chars_left_to_find is not None and not chars_left_to_find: + break + normed_and_stripped_probs = 
CharProbs.strip_and_normalize_probs(probs) + assert CharProbs._almost_equal_to_one(sum(normed_and_stripped_probs.values())) + return normed_and_stripped_probs + + @staticmethod + def _should_be_bigram_but_is_not(glyphs: Bigram, filename: str) -> bool: + return "2" in basename(filename) and len(glyphs) == 1 + + @staticmethod + def _is_replace_char(glyphs: Char | Bigram) -> bool: + return "65533" in " ".join(str(ord(char)) for char in glyphs) + + @staticmethod + def _currently_processing_bigrams(filename: str) -> bool: + return "2" in basename(filename) + + def _extract_bigram_data(self) -> ProbDict: + bi_probs = defaultdict(float) + for bi_filename in self.bi_filenames: + bi_probs = CharProbs._merge_probs( + bi_probs, self._extract_arnes_probs(bi_filename) + ) + return bi_probs + + @property + def mono_probs(self) -> ProbDict: + """The probabilities for all single (special) characters""" + return self._mono_probs + + @property + def bi_probs(self) -> ProbDict: + """The bigram probabilities for all (special) character pairs""" + return self._bi_probs diff --git a/src/ilp_keyboard_layout_optimization/receive_data.py b/src/ilp_keyboard_layout_optimization/receive_data.py deleted file mode 100644 index 4841175717d9e174051beb09f0de27725aa70200..0000000000000000000000000000000000000000 --- a/src/ilp_keyboard_layout_optimization/receive_data.py +++ /dev/null @@ -1,150 +0,0 @@ -"""This module contains a class representing (special) character counts""" - -__all__ = ["CharProbs"] - -import csv -from math import comb -from os.path import abspath, basename -from typing import Optional -from urllib.request import urlopen - -from src.ilp_keyboard_layout_optimization.type_aliases import CharSet, CharTuple - - -class CharProbs: - """Instances represent all relevant (special) character probabilities - - Parameters - ---------- - mono_url : str - download URL for the monogram probabilities including the file name - bi_url : str - download URL for the bigram probabilities including 
the file name - """ - - def __init__( - self, - chars: CharTuple = None, - mono_url: str = "http://www.ids-mannheim.de/fileadmin/kl/derewo/" - "DeReChar-v-uni-204-a-c-2018-02-28-1.0.csv", - bi_url: str = "http://practicalcryptography.com/media/cryptanalysis/files/" - "german_bigrams.txt", - ): - self.chars: Optional[CharSet] = None if chars is None else set(chars) - self.mono_url: str = mono_url - self.bi_url: str = bi_url - self.mono_filename: str = self.download_raw_to_file(self.mono_url) - self.bi_filename: str = self.download_raw_to_file(self.bi_url) - self._mono_probs = self._extract_monogram_data() - self._bi_probs = self._extract_bigram_data() - - @staticmethod - def download_raw_to_file(url: str) -> str: - """Download raw data from a URL - - Parameters - ---------- - url : str - download URL including file name - - Returns - ------- - str - file name of the raw data on disc extracted from the URL - """ - filename = abspath(basename(url)) - with urlopen(url) as response, open(filename, "wb") as out_file: - downloaded_raw_data = response.read() - out_file.write(downloaded_raw_data) - return filename - - def _extract_monogram_data(self) -> dict[str, float]: - with open(self.mono_filename, encoding="UTF-8") as mono_csv_file: - mono_csv_dialect: str = "derechar" - csv.register_dialect( - mono_csv_dialect, delimiter=" ", quoting=csv.QUOTE_NONE - ) - reader = csv.DictReader( - mono_csv_file, - fieldnames=[ - "probability", - "absolute_count", - "hexadecimal Unicode code point", - "decimal value corresponding to the code", - "unicode general category", - "glyph", - "name of the symbol", - "unicode block designation", - ], - dialect=mono_csv_dialect, - ) - total_sum: float = 0.0 - mono_probs: dict[str, float] = {} - for row in reader: - try: - probability = float(row["probability"]) - except ValueError: - continue - total_sum += probability - if self.chars is None or row["glyph"] in self.chars: - mono_probs[row["glyph"]] = probability - assert 
mono_probs[row["glyph"]] == float(row["probability"]) - assert round(total_sum, 8) == 1.0 if self.chars is None else total_sum <= 1.0 - assert ( - len(mono_probs) == reader.line_num - 4 - if self.chars is None - else len(self.chars) - ) - return mono_probs - - def _extract_bigram_data(self) -> dict[str, float]: - bi_txt_dialect: str = "jamestxt" - csv.register_dialect(bi_txt_dialect, delimiter=" ") - with open(self.bi_filename, encoding="UTF-8") as bi_csv_file: - reader = csv.DictReader( - bi_csv_file, - fieldnames=[ - "bigram", - "absolute_count", - ], - dialect=bi_txt_dialect, - ) - bi_probs: dict[str, float] = {} - for row in reader: - try: - absolute_count = int(row["absolute_count"]) - except ValueError: - continue - assert len(row["bigram"]) == 2 - if self.chars is None or ( - row["bigram"][0] in self.chars and row["bigram"][1] in self.chars - ): - bi_probs[row["bigram"]] = absolute_count - absolute_sum = sum(bi_probs.values()) - for character, count in bi_probs.items(): - bi_probs[character] = count / absolute_sum - assert round(sum(bi_probs.values()), 8) == 1 - assert len(bi_probs) == reader.line_num or len(bi_probs) <= comb( - reader.line_num, 2 - ) - return bi_probs - - @property - def mono_probs(self): - """The probabilities for all single (special) characters""" - return self._mono_probs - - @property - def bi_probs(self): - """The bigram probabilities for all (special) character pairs""" - return self._bi_probs - - -if __name__ == "__main__": - CharProbs( - None, - "http://www.ids-mannheim.de/fileadmin/kl/derewo/" - "DeReChar-v-uni-204-a-c-2018-02-28-1.0.csv", - "http://practicalcryptography.com/media/cryptanalysis/files/" - "german_bigrams.txt", - ) diff --git a/src/ilp_keyboard_layout_optimization/type_aliases.py b/src/ilp_keyboard_layout_optimization/type_aliases.py index 1eb2b5db5aa09d43afb9abe2a04d66e3809fdae6..6160ee22f3d0527b7de5f13a5ea6c9ce5f3fef1c 100644 --- a/src/ilp_keyboard_layout_optimization/type_aliases.py +++ 
import os
from collections import defaultdict
from typing import Callable, List
from urllib.request import urlopen

import pytest
from hypothesis import given, strategies as hst
from hypothesis.strategies import composite

from ilp_keyboard_layout_optimization.data_aquisition.chars import Chars
from ilp_keyboard_layout_optimization.data_aquisition.probabilities import CharProbs
from ilp_keyboard_layout_optimization.type_aliases import ProbDict

# U+FFFD, the character ``CharProbs._is_replace_char`` is supposed to detect.
REPLACEMENT_CHAR = chr(65533)


# Tests added to test/test_chars.py. Every test carries a unique name, because a
# second definition of the same name replaces the first one, which then never runs.


def test_chars_method_remove():
    assert Chars().remove


def test_chars_method_remove_call_with_parameter():
    test_chars = Chars()
    assert test_chars.remove(test_chars.chars[0]) is None


def test_chars_method_remove_result():
    test_chars = Chars()
    first_char = test_chars.chars[0]
    assert first_char in test_chars.chars
    test_chars.remove(test_chars.chars[0])
    assert first_char not in test_chars.chars


def test_attribute_is_bigram_exists():
    assert Chars.is_bigram


@given(hst.text(min_size=2, max_size=2))
def test_attribute_is_bigram(chars):
    assert Chars.is_bigram(chars)


@given(hst.text(min_size=1, max_size=1))
def test_attribute_is_too_short_for_bigram(chars):
    assert not Chars.is_bigram(chars)


@given(hst.text(min_size=3, max_size=10))
def test_attribute_is_too_long_for_bigram(chars):
    assert not Chars.is_bigram(chars)


def test_attribute_is_monogram_exists():
    assert Chars.is_monogram


@given(hst.text(min_size=1, max_size=1))
def test_attribute_is_monogram(char):
    assert Chars.is_monogram(char)


@given(hst.text(min_size=2, max_size=10))
def test_attribute_is_not_monogram(chars):
    assert not Chars.is_monogram(chars)


# Contents of the new module test/test_probabilities.py.


@pytest.fixture(scope="session")
def chars_probs():
    """An instance without character restriction based on the default URLs"""
    return CharProbs()


@composite
def probs(draw: Callable):
    """Draw a normalized dictionary of one to one hundred positive entries"""
    numbers = defaultdict(int)
    for key in range(draw(hst.integers(min_value=1, max_value=100))):
        numbers[key] = draw(hst.integers(min_value=1))
    normalized_numbers = CharProbs.strip_and_normalize_probs(numbers)
    return normalized_numbers


@pytest.fixture(scope="session")
def default_probs():
    """An empty dictionary of probabilities"""
    return defaultdict(int)


@pytest.fixture(scope="session")
def mono_url():
    return "https://hg.sr.ht/~arnebab/evolve-keyboard-layout/raw/1gramme.txt?rev=tip"


@pytest.fixture(scope="session")
def bi_url():
    return "https://hg.sr.ht/~arnebab/evolve-keyboard-layout/raw/2gramme.txt?rev=tip"


@pytest.fixture(scope="session")
def custom_chars_probs(mono_url, bi_url):
    """An instance restricted to three characters based on one file each"""
    return CharProbs(Chars("ABC"), [mono_url], [bi_url])


def find_filename(filenames: List[str], name: str) -> str:
    """Return the first of the file names containing the provided name"""
    return next(filename for filename in filenames if name in filename)


def compare_files_contents_to_http_responses(filenames: List[str], urls: List[str]):
    """Assert that all files exist and match the contents behind their URLs"""
    for filename in filenames:
        assert os.path.exists(filename)
    for url, filename in zip(urls, filenames):
        # Decode both sides as UTF-8 instead of relying on the platform default.
        with urlopen(url) as response, open(filename, encoding="utf-8") as file:
            assert file.read() == response.read().decode("utf-8").replace(
                "\r\n", "\n"
            )


def test_class_attribute_mono_urls():
    assert CharProbs.MONO_URLS


def test_class_attribute_bi_urls():
    assert CharProbs.BI_URLS


def test_static_download_method():
    assert CharProbs.download_raw_to_file


def test_static_downloads_method():
    assert CharProbs.download_raw_to_files


def test_class_method_merge_probs_exists():
    assert CharProbs._merge_probs


@given(probs(), probs())
def test_class_method_merge_probs_is_not_empty(probs_1, probs_2):
    assert CharProbs._merge_probs(probs_1, probs_2)


@given(probs(), probs())
def test_class_method_merge_probs_sums_up_to_one(probs_1, probs_2):
    assert sum(CharProbs._merge_probs(probs_1, probs_2).values()) == pytest.approx(1)


@given(probs())
def test_class_method_merge_with_default_probs_is_not_empty(default_probs, probs_2):
    # Hand over a copy to keep the session-wide fixture empty for all examples.
    assert CharProbs._merge_probs(default_probs.copy(), probs_2)


@given(probs())
def test_class_method_merge_with_default_probs_sums_up_to_one(
    default_probs, probs_2
):
    assert sum(
        CharProbs._merge_probs(default_probs.copy(), probs_2).values()
    ) == pytest.approx(1)


def test_class_method_almost_equal_to_one():
    assert CharProbs._almost_equal_to_one


def test_class_method_normalize_probs():
    assert CharProbs.strip_and_normalize_probs


def test_class_method_strip_and_normalize_probs_sums_up_to_one():
    test_dict = {str(integer): integer for integer in range(5)}
    assert sum(
        CharProbs.strip_and_normalize_probs(test_dict).values()
    ) == pytest.approx(1.0)


def test_class_method_strip_and_normalize_probs_stripped_from_zeros():
    test_dict = {str(integer): integer for integer in range(5)}
    assert "0" not in set(CharProbs.strip_and_normalize_probs(test_dict).keys())


def test_init_attribute_chars(custom_chars_probs):
    assert custom_chars_probs.chars


def test_init_attribute_mono_urls(chars_probs):
    assert chars_probs.mono_urls


def test_init_attribute_bi_urls(chars_probs):
    assert chars_probs.bi_urls


def test_init_attribute_mono_filenames(chars_probs):
    assert chars_probs.mono_filenames


def test_init_attribute_bi_filenames(chars_probs):
    assert chars_probs.bi_filenames


def test_init_attribute_download_function(chars_probs):
    assert chars_probs.download_raw_to_file


def test_download_function_result(custom_chars_probs, bi_url):
    file = custom_chars_probs.download_raw_to_file(bi_url)
    assert os.path.exists(file)


def test_download_function_type(custom_chars_probs, bi_url):
    assert isinstance(custom_chars_probs.download_raw_to_file(bi_url), str)


def test_downloads_function_result(custom_chars_probs, bi_url, mono_url):
    files = custom_chars_probs.download_raw_to_files([bi_url, mono_url])
    for file in files:
        assert os.path.exists(file)


def test_downloads_function_type(custom_chars_probs, bi_url, mono_url):
    assert isinstance(
        custom_chars_probs.download_raw_to_files([bi_url, mono_url]), list
    )


def test_init_attribute_downloads_function(custom_chars_probs):
    assert custom_chars_probs.download_raw_to_files


def test_init_attribute_extract_monogram_data(chars_probs):
    assert chars_probs._extract_monogram_data


def test_init_attribute_extract_bigram_data(chars_probs):
    assert chars_probs._extract_bigram_data


def test_init_attribute_mono_probs(chars_probs):
    assert chars_probs.mono_probs


def test_init_attribute_bi_probs(chars_probs):
    assert chars_probs.bi_probs


def test_init_attribute_currently_processing_bigrams():
    assert CharProbs._currently_processing_bigrams


def test_init_attribute_currently_processing_bigrams_result_for_bigram(bi_url):
    assert CharProbs._currently_processing_bigrams(bi_url)


def test_init_attribute_currently_processing_bigrams_result_for_monogram(mono_url):
    assert not CharProbs._currently_processing_bigrams(mono_url)


def test_init_attribute_bi_probs_type(custom_chars_probs):
    assert isinstance(custom_chars_probs.bi_probs, ProbDict.__origin__)


def test_init_attribute_mono_probs_type(custom_chars_probs):
    assert isinstance(custom_chars_probs.mono_probs, ProbDict.__origin__)


def test_init_attribute_bi_probs_sum_up_to_one(custom_chars_probs):
    assert sum(custom_chars_probs.bi_probs.values()) == pytest.approx(1)


def test_init_attribute_mono_probs_sum_up_to_one(custom_chars_probs):
    assert sum(custom_chars_probs.mono_probs.values()) == pytest.approx(1)


def test_init_attribute_chars_type(custom_chars_probs):
    assert isinstance(custom_chars_probs.chars, str)


def test_init_attribute_extract_derechar_monos(chars_probs):
    filename = find_filename(
        chars_probs.mono_filenames, "DeReChar-v-uni-204-a-c-2018-02-28-1.0.csv"
    )
    assert isinstance(
        chars_probs._extract_derechar_monos(filename), ProbDict.__origin__
    )


def test_init_attribute_extract_derechar_sum_up_to_one(chars_probs):
    filename = find_filename(
        chars_probs.mono_filenames, "DeReChar-v-uni-204-a-c-2018-02-28-1.0.csv"
    )
    assert sum(chars_probs._extract_derechar_monos(filename).values()) == pytest.approx(
        1
    )


def test_init_attribute_extract_arnes_monos(chars_probs):
    assert chars_probs._extract_arnes_probs


@pytest.mark.parametrize(
    "basename", ["1-gramme.15.txt", "1-gramme.wiki.txt", "1gramme.txt"]
)
def test_init_attribute_extract_arnes_monos_type(basename, chars_probs):
    filename = find_filename(chars_probs.mono_filenames, basename)
    assert isinstance(chars_probs._extract_arnes_probs(filename), ProbDict.__origin__)


@pytest.mark.parametrize(
    "basename", ["1-gramme.15.txt", "1-gramme.wiki.txt", "1gramme.txt"]
)
def test_init_attribute_extract_arnes_monos_sum_up_to_one(basename, chars_probs):
    filename = find_filename(chars_probs.mono_filenames, basename)
    assert sum(chars_probs._extract_arnes_probs(filename).values()) == pytest.approx(1)


def test_init_default_monogram_file_download(chars_probs):
    compare_files_contents_to_http_responses(
        chars_probs.mono_filenames, chars_probs.mono_urls
    )


def test_init_custom_monogram_file_download(custom_chars_probs):
    compare_files_contents_to_http_responses(
        custom_chars_probs.mono_filenames, custom_chars_probs.mono_urls
    )


def test_init_default_bigram_file_download(chars_probs):
    compare_files_contents_to_http_responses(
        chars_probs.bi_filenames, chars_probs.bi_urls
    )


def test_init_custom_bigram_file_download(custom_chars_probs):
    compare_files_contents_to_http_responses(
        custom_chars_probs.bi_filenames, custom_chars_probs.bi_urls
    )


def test_init_extract_monogram_data(chars_probs):
    assert chars_probs._extract_monogram_data()


def test_init_extract_bigram_data(chars_probs):
    assert chars_probs._extract_bigram_data()


def test_init_custom_bigram_data(custom_chars_probs):
    bi_probs = custom_chars_probs.bi_probs
    for bigram in ("AA", "AB", "BB", "BA", "AC", "BC", "CC", "CA", "CB"):
        assert bigram in bi_probs


def test_init_custom_monogram_data(custom_chars_probs):
    mono_probs = custom_chars_probs.mono_probs
    for monogram in ("A", "B", "C"):
        assert monogram in mono_probs


def test_almost_equal_to_one_too_big():
    assert not CharProbs._almost_equal_to_one(1.00000001)


def test_almost_equal_to_one_too_small():
    assert not CharProbs._almost_equal_to_one(0.99999999)


def test_almost_equal_to_one_just_as_small():
    assert CharProbs._almost_equal_to_one(0.999999999)


def test_almost_equal_to_one_just_as_big():
    assert CharProbs._almost_equal_to_one(1.000000001)


def test_attr_is_replace_char():
    assert CharProbs._is_replace_char


def test_attr_should_be_bigram_but_is_not():
    assert CharProbs._should_be_bigram_but_is_not


@given(hst.characters())
def test_should_be_bigram_but_is_not_for_single_char(bi_url, char):
    assert CharProbs._should_be_bigram_but_is_not(char, bi_url)


@given(hst.text(min_size=2, max_size=10))
def test_should_be_bigram_but_is_not_for_multiple_chars(bi_url, chars):
    assert not CharProbs._should_be_bigram_but_is_not(chars, bi_url)


def test_is_replace_char():
    assert CharProbs._is_replace_char(REPLACEMENT_CHAR)


# The replacement character itself has to be excluded from the drawn examples.
@given(hst.characters().filter(lambda char: char != REPLACEMENT_CHAR))
def test_is_not_replace_char(char):
    assert not CharProbs._is_replace_char(char)
test_init_attribute_bi_url(characters_count): - assert characters_count.bi_url - - -def test_init_attribute_mono_filename(characters_count): - assert characters_count.mono_filename - - -def test_init_attribute_bi_filename(characters_count): - assert characters_count.bi_filename - - -def test_init_attribute_download_function(characters_count): - assert characters_count.download_raw_to_file - - -def test_init_attribute_extract_monogram_data(characters_count): - assert characters_count._extract_monogram_data - - -def test_init_attribute_extract_bigram_data(characters_count): - assert characters_count._extract_bigram_data - - -def test_init_attribute_mono_probs(characters_count): - assert characters_count.mono_probs - - -def test_init_attribute_bi_probs(characters_count): - assert characters_count.bi_probs - - -def test_init_default_monogram_file_download(characters_count): - assert os.path.exists(characters_count.mono_filename) - with urlopen(characters_count.mono_url) as response, open( - characters_count.mono_filename - ) as file: - assert file.read() == response.read().decode() - - -def test_init_custom_monogram_file_download(characters_count_custom): - with urlopen(characters_count_custom.mono_url) as response, open( - characters_count_custom.mono_filename - ) as file: - assert file.read() == response.read().decode() - - -def test_init_default_bigram_file_download(characters_count): - assert os.path.exists(characters_count.bi_filename) - with urlopen(characters_count.bi_url) as response, open( - characters_count.bi_filename - ) as file: - assert file.read() == response.read().decode() - - -def test_init_extract_monogram_data(characters_count): - assert characters_count._extract_monogram_data() - - -def test_init_extract_bigram_data(characters_count): - assert characters_count._extract_bigram_data() - - -def test_init_custom_bigram_data(characters_count_custom): - assert ( - "AA" in characters_count_custom.bi_probs - and "AB" in 
def test_prob_dict():
    """The alias has to be a ``defaultdict`` mapping characters or bigrams to floats"""
    expected_alias = defaultdict[Char | Bigram, float]
    assert ProbDict == expected_alias