Word Generator#
Scope#
This notebook was inspired by the great video made by David Louapre, available on his YouTube channel “Science Étonnante”.
https://sciencetonnante.wordpress.com/2015/11/06/la-machine-a-inventer-des-mots-version-ikea/
Here the word generator is embedded in a class.
The video#
# Embed the video that inspired this notebook (YouTube id "YsR7r2378j0").
# Kept as the bare last expression of the cell so the notebook renders it.
IPython.display.YouTubeVideo("YsR7r2378j0")
The class that manages the word generator#
class word_generator:
    """Random word generator driven by letter-trigram statistics.

    The generator reads a word list, counts how often each character
    follows every pair of preceding characters, and then samples new
    words from those counts.

    Attributes:
        dic_file: Path of the word list given to the constructor.
        encoding: Text encoding used for reading and writing
            ("ISO-8859-1", so every character code fits in 0..255).
        outfile: Path of the text file written by ``__call__``.
        dico: List of the words read from ``dic_file``.
        count: ``(256, 256, 256)`` int32 array; ``count[i, j, k]`` is the
            number of times character code ``k`` followed the pair
            ``(i, j)``. Code 0 stands for "before the word starts" and
            code 10 (newline) marks the end of a word.
        p2Da: 26x26 contrast-enhanced letter-pair matrix, set by ``plot``.
    """

    def __init__(self, dic_file):
        """Load the dictionary and build the trigram count table.

        Args:
            dic_file: Path to a text file with one word per line. Anything
                following a space, tab, slash, backslash, comma or opening
                parenthesis on a line is ignored.

        Raises:
            OSError: If ``dic_file`` cannot be opened.
        """
        self.dic_file = dic_file
        self.encoding = "ISO-8859-1"
        # Name of the text file receiving the generated words.
        self.outfile = r"output.txt"

        self.read_dic()

    def read_dic(self):
        """Read ``self.dic_file`` and fill ``self.dico`` and ``self.count``."""
        self.dico = []  # words of the dictionary, in file order
        self.count = np.zeros((256, 256, 256), dtype="int32")

        # Raw string avoids the invalid "\(" escape; backslash is included
        # in the delimiter set, as the original comment intended.
        delimiter = re.compile(r"[ /\\\t,(]")

        with open(self.dic_file, "r", encoding=self.encoding) as lines:
            for line in lines:
                # Drop the line ending first so it never leaks into the
                # stored word, then keep only what precedes a delimiter.
                word = delimiter.split(line.rstrip("\r\n"), maxsplit=1)[0]
                if not word:
                    # Blank lines are not words; do not count them.
                    continue
                self.dico.append(word)

                # Walk the word plus its end-of-word marker, counting each
                # character in the context of its two predecessors.
                i, j = 0, 0
                for k in map(ord, word + "\n"):
                    self.count[i, j, k] += 1
                    i, j = j, k

    def plot(self, alpha=0.33):
        """Draw the a-z letter-pair transition matrix.

        The 26x26 matrix that is drawn is also stored in ``self.p2Da`` so
        it can be reused by other figures.

        Args:
            alpha: Exponent applied to the probabilities; plotting
                ``p ** alpha`` instead of ``p`` gives better contrast.
        """
        # Collapse the oldest character: pair_counts[j, k] counts "k after j".
        pair_counts = self.count.sum(axis=0)
        row_totals = pair_counts.sum(axis=1, keepdims=True)
        # Rows that were never observed stay at probability 0.
        p2D = np.divide(
            pair_counts,
            row_totals,
            out=np.zeros(pair_counts.shape, dtype=float),
            where=row_totals != 0,
        )

        # Keep only the lowercase letters a to z (codes 97 to 122).
        first, last = ord("a"), ord("z")
        self.p2Da = (p2D**alpha)[first : last + 1, first : last + 1]

        plt.figure(figsize=(8, 8))
        gr = plt.imshow(self.p2Da, interpolation="nearest", cmap=mpl.cm.OrRd)
        plt.axis("off")
        plt.colorbar(gr, orientation="vertical")

        # Label the rows (left) and the columns (top) with their letter.
        for offset in range(last - first + 1):
            letter = chr(first + offset)
            plt.text(
                -1,
                offset,
                letter,
                horizontalalignment="center",
                verticalalignment="center",
            )
            plt.text(
                offset,
                -1,
                letter,
                horizontalalignment="center",
                verticalalignment="center",
            )

    def __call__(self, smin=4, smax=12, K=5):
        """Generate random words, print them and write them to ``self.outfile``.

        For every length from ``smin`` to ``smax`` (inclusive), words are
        sampled until ``K`` of that exact length have been produced. Words
        that already exist in the dictionary are suffixed with ``*``.

        Note that sampling keeps going until enough words of each length
        are found, so a length the model cannot produce never terminates.

        Args:
            smin: Minimum number of letters of the generated words.
            smax: Maximum number of letters of the generated words.
            K: Number of words to generate for each length.

        Raises:
            ValueError: If the count table holds no word start, i.e. no
                word has been read.
        """
        if not self.count[0, 0].any():
            raise ValueError("cannot generate words: the dictionary is empty")

        # Normalize the counts into P(k | i, j); contexts that were never
        # observed keep an all-zero row instead of producing NaN.
        totals = self.count.sum(axis=2, keepdims=True)
        p = np.divide(
            self.count,
            totals,
            out=np.zeros(self.count.shape, dtype=float),
            where=totals != 0,
        )

        known_words = set(self.dico)  # constant-time membership tests
        end_of_word = ord("\n")

        # newline="\n" keeps the written bytes identical on every platform.
        with open(self.outfile, "w", encoding=self.encoding, newline="\n") as f:
            for size in range(smin, smax + 1):
                total = 0
                while total < K:
                    i, j = 0, 0
                    letters = []
                    while j != end_of_word:
                        k = int(choice(256, p=p[i, j, :]))
                        letters.append(chr(k))
                        i, j = j, k
                    # The last sampled character is the end-of-word marker.
                    word = "".join(letters[:-1])
                    if len(word) != size:
                        continue
                    x = word + "*" if word in known_words else word
                    total += 1
                    print(x)
                    f.write(x + "\n")
<>:25: SyntaxWarning: invalid escape sequence '\('
<>:25: SyntaxWarning: invalid escape sequence '\('
/tmp/ipykernel_1318/599884686.py:25: SyntaxWarning: invalid escape sequence '\('
l2 = re.split("[ /\\\t,\(]", l)[0] + "\n"
French#
# Build the French generator; the word list is read during construction.
gen_FR = word_generator(r"_DATA/dic/FR_aisi.txt")
# Draw the a-z letter-pair matrix; plot() also stores it in gen_FR.p2Da.
gen_FR.plot()
plt.show()
/tmp/ipykernel_1318/599884686.py:25: SyntaxWarning: invalid escape sequence '\('
l2 = re.split("[ /\\\t,\(]", l)[0] + "\n"
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[4], line 1
----> 1 gen_FR = word_generator(r"_DATA/dic/FR_aisi.txt")
2 gen_FR.plot()
3 plt.show()
Cell In[3], line 14, in word_generator.__init__(self, dic_file)
11 proba_matrix = r"matrix_FR.png"
12 self.outfile = r"output.txt"
---> 14 self.read_dic()
Cell In[3], line 20, in word_generator.read_dic(self)
17 self.dico = [] # to store the words of the dictionnary
19 self.count = np.zeros((256, 256, 256), dtype="int32")
---> 20 with codecs.open(self.dic_file, "r", self.encoding) as lines:
21 for l in lines:
22 # Trimming of the line :
23 # Split on white space, tab, slash backslah or open parenthesis
24 # and keep the first string, add EOL character
25 l2 = re.split("[ /\\\t,\(]", l)[0] + "\n"
File <frozen codecs>:918, in open(filename, mode, encoding, errors, buffering)
FileNotFoundError: [Errno 2] No such file or directory: '_DATA/dic/FR_aisi.txt'
# Generate French-looking words: they are printed and written to output.txt.
gen_FR()
Swedish#
# Build the Swedish generator; the word list is read during construction.
gen_SE = word_generator(r"_DATA/dic/SE_aisi.txt")
# Draw the a-z letter-pair matrix; plot() also stores it in gen_SE.p2Da.
gen_SE.plot()
# Generate Swedish-looking words. The output file name is the same
# "output.txt" for every generator, so this overwrites the French output.
gen_SE()
Compare French and Swedish#
# Side-by-side comparison of the French and Swedish letter-pair matrices.
# Both generators must have been plotted once already, since plot() is what
# fills their p2Da attribute.
fig = plt.figure()
panels = ((gen_FR, "French"), (gen_SE, "Swedish"))
for position, (generator, language) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.imshow(generator.p2Da, interpolation="nearest", cmap=mpl.cm.OrRd)
    ax.axis("off")
    # Label the rows (left) and the columns (top) with the letters a to z.
    # Drawing through the Axes object avoids relying on pyplot's implicit
    # "current axes" state.
    for offset in range(26):
        letter = chr(ord("a") + offset)
        ax.text(
            -1, offset, letter, horizontalalignment="center", verticalalignment="center"
        )
        ax.text(
            offset, -1, letter, horizontalalignment="center", verticalalignment="center"
        )
    ax.set_title(language)
plt.show()