OVH Cloud OVH Cloud

Script num=c3=a9ro 4

6 réponses
Avatar
Olivier Miakinen
#!/bin/bash
####################################################################
# Name:
# decode_headers
#
# Description:
# Script used for decoding RFC 2047 MIME encoded headers.
#
# Example:
# decode_headers <<POTIRON
# Message-ID: <c5a9-1@part.org>
# From: arvo@part.org (=?ASCII?Q?Arvo?= =?L1?Q?=20?= =?UTF-8?Q?P=C3=A4rt?=)
# To: =?Latin1?Q?Fr=E9d=E9ric_Chopin?= <fred@chopin.org>,
# =?Latin2?Q?Anton=EDn_Dvo=F8=E1k?= <anton@dvorak.org>
# Cc: Arvo =?UTF-8?Q?P=C3=A4rt?= <arvo@part.org>
# References: <A65R-4d@chopin.org> <c5a7-3@part.org>
# <A72Q-5a@chopin.org>
# In-Reply-To: <A72Q-5a@chopin.org>
# Subject: Re: Going to =?Shift-JIS?B?k4yLngo=?= (Tokyo) =?UTF-8?B?zpEK?=
# =?UTF-8?B?zrjOrs69zrEK?= (Athens) and =?ISO-8859-5?Q?=BC=DE=E1=DA?=
# =?ISO-8859-5?Q?=D2=D0?= (Moscow)
# POTIRON
# ->
# Message-ID: <c5a9-1@part.org>
# From: arvo@part.org (Arvo Pärt)
# To: Frédéric Chopin <fred@chopin.org>, Antonín Dvořák <anton@dvorak.org>
# Cc: Arvo Pärt <arvo@part.org>
# References: <A65R-4d@chopin.org> <c5a7-3@part.org> <A72Q-5a@chopin.org>
# In-Reply-To: <A72Q-5a@chopin.org>
# Subject: Re: Going to 東京 (Tokyo) Αθήνα (Athens) and Москва (Moscow)
####################################################################

#
# The following constants should be adapted for your system.
# For example, the executable base64 could be at /usr/local/bin instead
# of /usr/bin, and it could require parameter -D instead of -d.
#
CUT="/usr/bin/cut"
TR="/usr/bin/tr"
ICONV="/usr/bin/iconv"
if [ "$(uname)" = "Linux" ]; then
# GNU/Linux
DECODE_QP="/usr/bin/qprint -d"
DECODE_B64="/usr/bin/base64 -d"
else
# MacOS X
DECODE_QP="/usr/local/bin/qprint -d"
DECODE_B64="/usr/bin/base64 -D"
fi

#
# This script uses the following global variables
# VARY
# used when a function has to return a string and not only a number
# DECODED_LINE
# The current state of a header being currently decoded
# STATUS
# Has three possible values:
# - "none" at the beginning of a new header
# - "decoded-word" after a correctly decoded MIME part
# - "normal" after any other string
#
VARY=""
DECODED_LINE=""
STATUS="none"

#
# Function: usage
#
usage()
{
printf "Usage: %s [OPTION...] [FILE...]\n" $0
printf "Decode headers from given files for RFC 2047 MIME encodings.\n"
printf "\n"
printf "With no FILE, or when FILE is -, read standard input.\n"
printf "\n"
printf " -h, --help Give this help list\n"
exit 1
}

#
# Function: decode_word
#
# Description:
# Check that the parameter is an encoded word, then decode it and
# convert it to UTF-8.
#
# Parameter:
# A single (possibly MIME-encoded) word.
#
# Return value:
# If decoding is ok, set the result into VARY and return 0
# Otherwise return 1
#
decode_word()
{
word="$*"
###################################################################
# An encoded word contains only ASCII characters in range from
# '!' (Ascii value 0x21) to '~' (Ascii value 0x7e). This excludes
# in particular the SPACE (Ascii 0x20) and the TAB (Ascii 0x09).
#
# More specifically, it consists of five parts separated by
# question marks '?'
# 1. A character "="
# 2. The charset, e.g. "UTF-8" or "ISO-8859-1"
# 3. The encoding, B or Q in upper or lower case
# 4. The encoded text
# 5. A character "="
###################################################################

# Check that:
# - there is no character outside range from '!' to '~'
# - the 1st part is a "="
# - the 5th part is a "=" and it is the end of the string
if [ $(LANG=C expr "_$word" : '_[!-~]*$') = 0 ]; then return 1; fi
part1=$(printf "$word" | $CUT -f 1 -d '?')
part5=$(printf "$word" | $CUT -f 5- -d '?')
if [ "$part1" != "=" -o "$part5" != "=" ]; then return 1; fi

# Extract charset, encoding, and encoded text
charset=$(printf "$word" | $CUT -f 2 -d '?')
encoding=$(printf "$word" | $CUT -f 3 -d '?')
encoded=$(printf "$word" | $CUT -f 4 -d '?')

case $encoding in
B | b)
decoded=$(printf "$encoded" | $DECODE_B64 2>/dev/null)
if [ $? != 0 ]; then return 1; fi
;;
Q | q)
decoded=$(printf "$encoded" | $TR "_" " " | $DECODE_QP 2>/dev/null)
if [ $? != 0 ]; then return 1; fi
;;
*)
return 1
;;
esac

VARY=$(printf "$decoded" | $ICONV -f $charset -t UTF-8 2>/dev/null)
return $?
}

#
# Function: add_word
#
# Description:
# Try to decode a new word, and update DECODED_LINE and STATUS
# depending on the result and the previous STATUS.
#
# Parameter:
# A single (possibly MIME-encoded) word.
#
# Return value:
# None
#
# Side effects:
# Change DECODED_LINE and STATUS
#
add_word()
{
word="$*"
prefix=""
suffix=""

# Manage possible initial and final parentheses
while [ $(LANG=C expr "_$word" : '_[()]') != 0 ]; do
prefix="${prefix}${word:0:1}"
word="${word:1}"
done
while [ $(LANG=C expr "_$word" : '_.*[()]$') != 0 ]; do
suffix="${word: -1}${suffix}"
word="${word:0: -1}"
done

if decode_word "$word"; then
word="${prefix}${VARY}${suffix}"
if [ "$STATUS" = "normal" ]; then
DECODED_LINE="${DECODED_LINE} "
elif [ "$STATUS" != "none" -a -n "${prefix}" ]; then
DECODED_LINE="${DECODED_LINE} "
fi
DECODED_LINE="${DECODED_LINE}${word}"
if [ -n "${suffix}" ]; then
STATUS="normal"
else
STATUS="decoded-word"
fi
else
word="${prefix}${word}${suffix}"
if [ "$STATUS" != "none" ]; then
DECODED_LINE="${DECODED_LINE} "
fi
DECODED_LINE="${DECODED_LINE}${word}"
STATUS="normal"
fi
}

#
# Function: flush_line
#
# Description:
# Before beginning to manage a new header, or just before ending
# the script, print the pending DECODED_LINE if any.
#
# Parameter:
# None
#
# Return value:
# None
#
# Side effects:
# Print things to stdout
# Change DECODED_LINE and STATUS
#
flush_line()
{
if [ -n "${DECODED_LINE}" ]; then
printf "%s\n" "${DECODED_LINE}"
DECODED_LINE=""
STATUS="none"
fi
}

#
# Function: manage_line
#
# Description:
# Manage a new line, which can be either the beginning or the
# continuation of a mail/news header.
# This function prints the previous line if this is a new one,
# then it adds successive parts to the new DECODED_LINE, while
# updating STATUS as needed.
#
# Parameter:
# An input line.
#
# Return value:
# None
#
# Side effects:
# Print things to stdout
# Change DECODED_LINE and STATUS
#
manage_line()
{
line="$*"

# Is it a continuation line?
if [ $(LANG=C expr "_$line" : "_[ \t]") = 0 ]; then
# No: new header
flush_line
fi

for word in $line; do
add_word "$word"
done
}

#
# Function: manage_file
#
# Description:
# Call manage_line for each line in a given file
#
# Parameter:
# A file name, or "-" for stdin
#
# Return value:
# None
#
# Side effects:
# Same as manage_line
#
manage_file()
{
file=${1--} # POSIX-compliant; ${1:--} can be used either.
while IFS= read -r line; do
manage_line "$line"
done < <(cat -- "$file")
}

#
# Parse arguments for -h or --help
#
for i in "$@"; do
case $i in
-h | --help)
usage
;;
-)
;;
-*)
printf "Unknown argument '%s'\n" "$i"
usage
;;
esac
done

#
# Main loop.
# Call manage_file for each filename in parameters,
# then print the last pending DECODED_LINE if any.
#
if [ "$*" = "" ]; then
manage_file "-"
else
for file in "$@"; do
manage_file "$file"
done
fi
flush_line

exit 0
--
Olivier Miakinen

6 réponses

Avatar
M.V.
Olivier Miakinen a suggéré en date du 18 octobre 2019 :
#!/bin/bash

**********
MV$ echo "From: Arvo =?UTF-8?Q?Pärt?= " | /usr/local/bin/olivier2.sh
From: Arvo Pärt
**********
Ça a donc l'air de fonctionner. ;-)
Mais :
**********
MV$ echo "Organization: =?NF_Z_62-010_(1973)?Q?C'{tait_o|_d{??=" | /usr/local/bin/olivier2.sh
Organization: =?NF_Z_62-010_(1973)?Q?C'{tait_o|_d{?? **********
C'est donc raté pour ce charset (NB mon iconv ne reconnaît
pas ce charset).
Maintenant, comment je fais pour tester avec un texte du
genre de celui que tu as donné en exemple ?
--
Michel VAUQUOIS - http://michelvauquois.fr
Avatar
Elephant Man
Le 18/10/2019 à 11:24, Olivier Miakinen a écrit :
#!/bin/bash
####################################################################
# Name:
# decode_headers
#
# Description:
# Script used for decoding RFC 2047 MIME encoded headers.

Et sinon, tu connais le Python aussi un peu ? :)
Avatar
josephb
Olivier Miakinen <om+ wrote:
ICONV="/usr/bin/iconv"

Pour ton information "Mac", son iconv du bash, ou du zsh, (iconv -l) connaît les encodages suivants :
ANSI_X3.4-1968 ANSI_X3.4-1986 ASCII CP367 IBM367 ISO-IR-6 ISO646-US ISO_646.IRV:1991 US US-ASCII CSASCII
UTF-8 UTF8
UTF-8-MAC UTF8-MAC
ISO-10646-UCS-2 UCS-2 CSUNICODE
UCS-2BE UNICODE-1-1 UNICODEBIG CSUNICODE11
UCS-2LE UNICODELITTLE
ISO-10646-UCS-4 UCS-4 CSUCS4
UCS-4BE
UCS-4LE
UTF-16
UTF-16BE
UTF-16LE
UTF-32
UTF-32BE
UTF-32LE
UNICODE-1-1-UTF-7 UTF-7 CSUNICODE11UTF7
UCS-2-INTERNAL
UCS-2-SWAPPED
UCS-4-INTERNAL
UCS-4-SWAPPED
C99
JAVA
CP819 IBM819 ISO-8859-1 ISO-IR-100 ISO8859-1 ISO_8859-1 ISO_8859-1:1987 L1 LATIN1 CSISOLATIN1
ISO-8859-2 ISO-IR-101 ISO8859-2 ISO_8859-2 ISO_8859-2:1987 L2 LATIN2 CSISOLATIN2
ISO-8859-3 ISO-IR-109 ISO8859-3 ISO_8859-3 ISO_8859-3:1988 L3 LATIN3 CSISOLATIN3
ISO-8859-4 ISO-IR-110 ISO8859-4 ISO_8859-4 ISO_8859-4:1988 L4 LATIN4 CSISOLATIN4
CYRILLIC ISO-8859-5 ISO-IR-144 ISO8859-5 ISO_8859-5 ISO_8859-5:1988 CSISOLATINCYRILLIC
ARABIC ASMO-708 ECMA-114 ISO-8859-6 ISO-IR-127 ISO8859-6 ISO_8859-6 ISO_8859-6:1987 CSISOLATINARABIC
ECMA-118 ELOT_928 GREEK GREEK8 ISO-8859-7 ISO-IR-126 ISO8859-7 ISO_8859-7 ISO_8859-7:1987 ISO_8859-7:2003 CSISOLATINGREEK
HEBREW ISO-8859-8 ISO-IR-138 ISO8859-8 ISO_8859-8 ISO_8859-8:1988 CSISOLATINHEBREW
ISO-8859-9 ISO-IR-148 ISO8859-9 ISO_8859-9 ISO_8859-9:1989 L5 LATIN5 CSISOLATIN5
ISO-8859-10 ISO-IR-157 ISO8859-10 ISO_8859-10 ISO_8859-10:1992 L6 LATIN6 CSISOLATIN6
ISO-8859-11 ISO8859-11 ISO_8859-11
ISO-8859-13 ISO-IR-179 ISO8859-13 ISO_8859-13 L7 LATIN7
ISO-8859-14 ISO-CELTIC ISO-IR-199 ISO8859-14 ISO_8859-14 ISO_8859-14:1998 L8 LATIN8
ISO-8859-15 ISO-IR-203 ISO8859-15 ISO_8859-15 ISO_8859-15:1998 LATIN-9
ISO-8859-16 ISO-IR-226 ISO8859-16 ISO_8859-16 ISO_8859-16:2001 L10 LATIN10
KOI8-R CSKOI8R
KOI8-U
KOI8-RU
CP1250 MS-EE WINDOWS-1250
CP1251 MS-CYRL WINDOWS-1251
CP1252 MS-ANSI WINDOWS-1252
CP1253 MS-GREEK WINDOWS-1253
CP1254 MS-TURK WINDOWS-1254
CP1255 MS-HEBR WINDOWS-1255
CP1256 MS-ARAB WINDOWS-1256
CP1257 WINBALTRIM WINDOWS-1257
CP1258 WINDOWS-1258
850 CP850 IBM850 CSPC850MULTILINGUAL
862 CP862 IBM862 CSPC862LATINHEBREW
866 CP866 IBM866 CSIBM866
MAC MACINTOSH MACROMAN CSMACINTOSH
MACCENTRALEUROPE
MACICELAND
MACCROATIAN
MACROMANIA
MACCYRILLIC
MACUKRAINE
MACGREEK
MACTURKISH
MACHEBREW
MACARABIC
MACTHAI
HP-ROMAN8 R8 ROMAN8 CSHPROMAN8
NEXTSTEP
ARMSCII-8
GEORGIAN-ACADEMY
GEORGIAN-PS
KOI8-T
CP154 CYRILLIC-ASIAN PT154 PTCP154 CSPTCP154
MULELAO-1
CP1133 IBM-CP1133
ISO-IR-166 TIS-620 TIS620 TIS620-0 TIS620.2529-1 TIS620.2533-0 TIS620.2533-1
CP874 WINDOWS-874
VISCII VISCII1.1-1 CSVISCII
TCVN TCVN-5712 TCVN5712-1 TCVN5712-1:1993
ISO-IR-14 ISO646-JP JIS_C6220-1969-RO JP CSISO14JISC6220RO
JISX0201-1976 JIS_X0201 X0201 CSHALFWIDTHKATAKANA
ISO-IR-87 JIS0208 JIS_C6226-1983 JIS_X0208 JIS_X0208-1983 JIS_X0208-1990 X0208 CSISO87JISX0208
ISO-IR-159 JIS_X0212 JIS_X0212-1990 JIS_X0212.1990-0 X0212 CSISO159JISX02121990
CN GB_1988-80 ISO-IR-57 ISO646-CN CSISO57GB1988
CHINESE GB_2312-80 ISO-IR-58 CSISO58GB231280
CN-GB-ISOIR165 ISO-IR-165
ISO-IR-149 KOREAN KSC_5601 KS_C_5601-1987 KS_C_5601-1989 CSKSC56011987
EUC-JP EUCJP EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE CSEUCPKDFMTJAPANESE
MS_KANJI SHIFT-JIS SHIFT_JIS SJIS CSSHIFTJIS
CP932
ISO-2022-JP CSISO2022JP
ISO-2022-JP-1
ISO-2022-JP-2 CSISO2022JP2
CN-GB EUC-CN EUCCN GB2312 CSGB2312
GBK
CP936 MS936 WINDOWS-936
GB18030
ISO-2022-CN CSISO2022CN
ISO-2022-CN-EXT
HZ HZ-GB-2312
EUC-TW EUCTW CSEUCTW
BIG-5 BIG-FIVE BIG5 BIGFIVE CN-BIG5 CSBIG5
CP950
BIG5-HKSCS:1999
BIG5-HKSCS:2001
BIG5-HKSCS BIG5-HKSCS:2004 BIG5HKSCS
EUC-KR EUCKR CSEUCKR
CP949 UHC
CP1361 JOHAB
ISO-2022-KR CSISO2022KR
CP856
CP922
CP943
CP1046
CP1124
CP1129
CP1161 IBM-1161 IBM1161 CSIBM1161
CP1162 IBM-1162 IBM1162 CSIBM1162
CP1163 IBM-1163 IBM1163 CSIBM1163
DEC-KANJI
DEC-HANYU
437 CP437 IBM437 CSPC8CODEPAGE437
CP737
CP775 IBM775 CSPC775BALTIC
852 CP852 IBM852 CSPCP852
CP853
855 CP855 IBM855 CSIBM855
857 CP857 IBM857 CSIBM857
CP858
860 CP860 IBM860 CSIBM860
861 CP-IS CP861 IBM861 CSIBM861
863 CP863 IBM863 CSIBM863
CP864 IBM864 CSIBM864
865 CP865 IBM865 CSIBM865
869 CP-GR CP869 IBM869 CSIBM869
CP1125
EUC-JISX0213
SHIFT_JISX0213
ISO-2022-JP-3
BIG5-2003
ISO-IR-230 TDS565
ATARI ATARIST
RISCOS-LATIN1
--
J. B.
Avatar
Olivier Miakinen
Le 18/10/2019 11:41, Elephant Man a écrit :
Le 18/10/2019 à 11:24, Olivier Miakinen a écrit :
#!/bin/bash
####################################################################
# Name:
# decode_headers
#
# Description:
# Script used for decoding RFC 2047 MIME encoded headers.

Et sinon, tu connais le Python aussi un peu ? :)

Très peu, mais j'ai déjà écrit quelques scripts qui fonctionnent en
lisant la doc. C'est vrai que je devrais peut-être essayer d'écrire
tout ça en python plutôt qu'en bash, déjà ça aurait plus de chances
de donner les mêmes résultats sur Mac que sur GNU/Linux (et pourquoi
pas Windows, soyons fou).
--
Olivier Miakinen
Avatar
josephb
Olivier Miakinen <om+ écrivit :
C'est vrai que je devrais peut-être essayer d'écrire
tout ça en python plutôt qu'en bash, déjà ça aurait plus de chances
de donner les mêmes résultats sur Mac que sur GNU/Linux (et pourquoi
pas Windows, soyons fou).

Partant sans doute du principe que c'est dans les vieux pots qu'on fait
la bonne soupe, les Python, Ruby ou Perl livrés avec l'OS X courant ont
toujours plusieurs versions de retard.
Et à partir de Catalina, Apple ne livrera plus ces langages, il faudra
se les installer soi-même.
--
J. B.
Avatar
Olivier Miakinen
Le 18/10/2019 14:54, Joseph-B a écrit :
Olivier Miakinen <om+ wrote:
ICONV="/usr/bin/iconv"

Pour ton information "Mac", son iconv du bash, ou du zsh, (iconv -l) connaît les encodages suivants :
[...]

Merci pour ça. Et pour ta propre information, la transformation que me fait
le NF_Z_62-010_(1973) semble correspondre exactement à ce que faisait le
petit switch sur un Apple II que j'avais dans les années 1980, basculant
entre US-ASCII et une version francisée !
--
Olivier Miakinen