?print-pdf
' Created for
# Both the identifier (a Cyrillic name) and the string literal contain
# non-ASCII characters; the literal mixes Bengali, a musical symbol and an emoji.
низ = "This is a normal Python string :ছ 𝄞 ☕"
print(низ)
# Output:
# This is a normal Python string :ছ 𝄞 ☕
ord() and chr() functions
ord(char) - return an integer representing the Unicode code point of the given char.
chr(i) - return the string representing a character whose Unicode code point is the integer i.
# Character -> Unicode code point.
print(ord("я"))
# 1103

# Unicode code point -> character.
print(chr(1103))
# я
# Four ways to write the same character in a string literal.

# 1. The Unicode symbol typed directly in the string:
print("Ѣ")
# 2. The character name, with the \N{...} escape:
print("\N{Cyrillic Capital Letter Yat}")
# 3. A 16-bit code point, as four hex digits after \u:
print("\u0462")
# 4. A 32-bit code point, as eight hex digits after \U:
print("\U00000462")
str.encode()
- syntax
str.encode(encoding="utf-8", errors="strict")
str.encode()
- example
string = "123абв"

# With no arguments, encode() uses the default encoding, UTF-8.
bytes_string = string.encode()

print("Byte object:", bytes_string)
print("Type: ", type(bytes_string))
print("Length:", len(bytes_string))

# Output (each Cyrillic letter takes two bytes in UTF-8):
# Byte object: b'123\xd0\xb0\xd0\xb1\xd0\xb2'
# Type:  <class 'bytes'>
# Length: 9
Note that len() of a bytes object returns the number of bytes, not the number of characters encoded!
# Encoding text that contains non-ASCII characters with the 'ascii' codec
# fails under the default errors="strict" policy.
text = 'Здравей, свят!'
try:
    # Only the call that can raise is kept inside the try body.
    ascii_encoded = text.encode('ascii')
except UnicodeEncodeError as e:
    print("Error:", e)
else:
    # Reached only when every character is representable in ASCII.
    print("ASCII Encoded:", ascii_encoded)
# Error: 'ascii' codec can't encode characters in position 0-6: ordinal not in range(128)
bytes.decode()
- syntax
bytes.decode(encoding="utf-8", errors="strict")
bytes.decode()
- example
byte_string = b'\xd0\xb0\xd0\xb1\xd0\xb2'

# With no arguments, decode() interprets the bytes as UTF-8.
string = byte_string.decode()

print("String object:", string)
print("Type: ", type(string))
print("String length:", len(string))
print("Byte_string length:", len(byte_string))

# Output (six bytes decode to three characters):
# String object: абв
# Type:  <class 'str'>
# String length: 3
# Byte_string length: 6
bytes object
A bytes object can be created with the bytes() constructor or by using a bytes literal prefixed with b (a byte string).
# Define a byte string with a bytes literal (b prefix).
byte_string = b'Hello'
print("Byte String:", byte_string)

# Create a bytes object holding the same byte values (the ASCII codes of
# 'H', 'e', 'l', 'l', 'o') with the bytes() constructor.
byte_data = bytes([72, 101, 108, 108, 111])
print("Bytes Object:", byte_data)

# Both spellings produce equal bytes objects.
if byte_string == byte_data:
    print("The byte string and the bytes object contain the same sequence of bytes.")
else:
    print("The byte string and the bytes object do not contain the same sequence of bytes.")

# Byte String: b'Hello'
# Bytes Object: b'Hello'
# The byte string and the bytes object contain the same sequence of bytes.
string = "Петър плет плете"

# Open the file for writing in text mode with encoding="cp1251":
# write() then stores the text as cp1251-encoded bytes.
with open("write_to_cp1251.txt", "w", encoding="cp1251") as fh:
    fh.write(string)
encode()
method
string = "Петър плет плете"

# Encode the text to cp1251 bytes explicitly, before touching the file.
bytes_sequence = string.encode(encoding="cp1251")

# Open the file for writing in binary mode: the bytes are written as-is.
# Plain "wb" is enough here; the "+" (read access) is never used.
with open("encode_to_cp1251.txt", "wb") as fh:
    fh.write(bytes_sequence)
filename = "write_to_cp1251.txt"

# Open the file for reading in text mode with encoding="cp1251":
# read() then returns the content already decoded to str.
with open(filename, "r", encoding="cp1251") as f:
    print(f.read())
# Петър плет плете
decode()
method
filename = "write_to_cp1251.txt"

# Open the file for reading in binary mode: read() returns raw bytes.
# Plain "rb" is enough here; "r+b" would also demand write permission.
with open(filename, "rb") as f:
    bytestring = f.read()

# Decode the raw bytes explicitly with the codec the file was written in.
decoded_string = bytestring.decode(encoding="cp1251")
print(decoded_string)
# Петър плет плете
Write a script, koi8r_to_utf8.py, which will receive an input file name as an argument and will create a UTF-8 encoded file with the same name, but with the suffix "_utf8_" added (quotes_utf8_.txt).
.
├── koi8r_to_utf8.py
└── quotes.txt
$ python koi8r_to_utf8.py quotes.txt
.
├── koi8r_to_utf8.py
├── quotes.txt
└── quotes_utf8_.txt
Make sure that quotes_utf8_.txt is properly converted and readable!
These slides are based on
customised version of
framework