...
 
Commits (2)
\begin{module}[id=python-humanities-ex]
\importmhmodule[path=python/en/nutshell]{python-nutshell}
\lstset{language=Python}
\begin{note}
\begin{omtext}
\importmhmodule[path=python/en/functions]{python-functions}
\importmhmodule[path=python/en/fstring-literals]{python-fstring-literals}
\importmhmodule[repos=MiKoMH/TDM,path=doccomp/en/regexp-practical]{regexp-practical}
\lstset{language=python}
\begin{nomtext}
We will now see what we can do with
\mtrefiis[regexp-practical?regexp]{regular}{expression} in a practical example.
\end{omtext}
\end{note}
\end{nomtext}
\begin{frame}[fragile,allowframebreaks,label=slide.humanities-ex]
\frametitle{Example: Correcting and Anonymizing Documents}
\begin{itemize}
\item
\begin{example}
We write a program that makes simple corrections on documents and also crosses
out all names.
We write a \trefi[python-functions]{function} that makes simple corrections on documents and
also crosses out all names to anonymize.
\begin{itemize}
\item \nlex{The worst president of the US,arguably was George W. Bush, right?}
\item \nlex{However,are you famILIar with Paul Erd\H{o}s or Henri Poincar\'e?}\lec{Unicode}
\end{itemize}
Here is the program:
Here is the function:
\begin{itemize}
\item we first initialize and load modules
\begin{lstlisting}
import re
import sys
\end{lstlisting}
\item then we decode the argument and put it into a variable
\begin{lstlisting}
s = sys.argv[1]
\end{lstlisting}
\item We put a space after a comma,\lec{use \lstinline|r| string prefix for ``raw strings''}
\begin{lstlisting}
s = re.sub(r",(\S)", r", \1", s)
\end{lstlisting}
\item capitalize the first letter of a new sentence,
\begin{lstlisting}
# after sentence-ending punctuation: one blank, then an upper-case letter
s = re.sub(r"([.?!])\s*(\S)",
           lambda m: m.group(1) + " " + m.group(2).upper(), s)
\end{lstlisting}
\newpage
\item next we make abbreviations for regular expressions to save space
\begin{lstlisting}
c = "[A-Z]"
l = "[a-z]"
\end{lstlisting}
\item remove capital letters in the middle of words
\begin{lstlisting}
# lower-case any run of capitals enclosed by lower-case letters
s = re.sub("({l})({c}+)({l})"
           .format(l=l, c=c),
           lambda m: "{0}{1}{2}".format(m.group(1), m.group(2).lower(),
                                        m.group(3)), s)
\end{lstlisting}%$
\item and we cross out all names for official public versions of government documents,
\begin{lstlisting}
# replace every non-blank character of a recognized name by X
s = re.sub(r"({c}{l}+ ({c}{l}*(\.?) )?{c}{l}+)"
           .format(l=l, c=c),
           lambda m: re.sub(r"\S", "X", m.group(1)),
           s)
\end{lstlisting}
\item finally, we print the result
\begin{lstlisting}[language=Python]
print(s)
\end{lstlisting}%$
\end{itemize}
\nlex{The worst president of the US,arguably was George W. Bush, right?} becomes\\
\nlex{The worst president of the US, arguably was XXXXXX XX XXXX, right?}
\end{example}
\end{itemize}
\item we first add blanks after commata
\lstinputmhlisting[linerange=2-3]{python/ex/humanities-ex.py}
\item capitalize the first letter of a new sentence,
\lstinputmhlisting[linerange=4-6]{python/ex/humanities-ex.py}
\newpage
\item next we make abbreviations for regular expressions to save space
\lstinputmhlisting[linerange=7-8]{python/ex/humanities-ex.py}
\item remove capital letters in the middle of words
\lstinputmhlisting[linerange=9-11]{python/ex/humanities-ex.py}
\item and we cross out all names for official public versions of government documents,
\lstinputmhlisting[linerange=12-14]{python/ex/humanities-ex.py}
\item finally, we return the result
\lstinputmhlisting[linerange=15-15]{python/ex/humanities-ex.py}
\end{itemize}
\nlex{The worst president of the US,arguably was George W. Bush, right?}\\ becomes\\
\nlex{The worst president of the US, arguably was XXXXXX XX XXXX, right?}
\end{example}
\end{itemize}
\end{frame}
\end{module}
%%% Local Variables:
......
import re
def corranon (s)
s = re.sub(r",(\S)", r", \1", s)
s = re.sub(r"([\.\?!])\w*(\S)",
lambda (m):m.group(1),r" ".upper()+m.group(2),
s)
c = "[A-Z]"
l = "[a-z]"
s = re.sub(f"({l})({c}+)({l})",
lambda (m):f"{m.group(1)}{m.group(2).lower()}{m.group(3)}",
s)
s = re.sub(f"({c}{l}+ ({c}{l}*(\.?) )?{c}{l}+)",
lambda (m):re.sub("\S", "X", m.group(1)),
s)