The Battle for Wesnoth  1.17.23+dev
unicode.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2023
3  by Philippe Plantier <ayin@anathas.org>
4  Copyright (C) 2005 by Guillaume Melquiond <guillaume.melquiond@gmail.com>
5  Copyright (C) 2003 by David White <dave@whitevine.net>
6  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 2 of the License, or
11  (at your option) any later version.
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY.
14 
15  See the COPYING file for more details.
16 */
17 
18 /**
19  * @file
20  * Unicode support functions.
21  */
22 
26 
27 #include "log.hpp"
28 
29 #include <cassert>
30 #include <limits>
31 
// Log domain object constructed with the name "engine".
// NOTE(review): nothing in the visible part of this file refers to log_engine;
// ERR_GENERAL below streams to lg::general() instead — confirm whether this object is still needed.
32 static lg::log_domain log_engine("engine");
// Shorthand for an error-severity log stream on the general log domain.
33 #define ERR_GENERAL LOG_STREAM(err, lg::general())
34 
35 namespace utf8 {
36 
37 static int byte_size_from_utf8_first(const unsigned char ch)
38 {
39  if (!(ch & 0x80)) {
40  return 1; // US-ASCII character, 1 byte
41  }
42  /* first bit set: character not in US-ASCII, multiple bytes
43  * number of set bits at the beginning = bytes per character
44  * e.g. 11110xxx indicates a 4-byte character */
45  int count = count_leading_ones(ch);
46  if (count == 1 || count > 6) { // count > 4 after RFC 3629
47  throw invalid_utf8_exception(); // Stop on invalid characters
48  }
49  return count;
50 }
51 
52 std::string lowercase(const std::string& s)
53 {
54  if(!s.empty()) {
55  utf8::iterator itor(s);
56  std::string res;
57 
58  for(;itor != utf8::iterator::end(s); ++itor) {
59  char32_t uchar = *itor;
60  // If wchar_t is less than 32 bits wide, we cannot apply towlower() to all codepoints
61  if(uchar <= static_cast<char32_t>(std::numeric_limits<wchar_t>::max()))
62  uchar = towlower(static_cast<wchar_t>(uchar));
63  res += unicode_cast<std::string>(uchar);
64  }
65 
66  res.append(itor.substr().second, s.end());
67  return res;
68  }
69  return s;
70 }
71 
72 std::size_t index(const std::string& str, const std::size_t index)
73 {
74  // chr counts characters, i is the codepoint index
75  // remark: several functions rely on the fallback to str.length()
76  unsigned int i = 0, len = str.size();
77  try {
78  for (unsigned int chr=0; chr<index && i<len; ++chr) {
79  i += byte_size_from_utf8_first(str[i]);
80  }
81  } catch(const invalid_utf8_exception&) {
82  ERR_GENERAL << "Invalid UTF-8 string.";
83  }
84  return i;
85 }
86 
87 std::size_t size(const std::string& str)
88 {
89  unsigned int chr, i = 0, len = str.size();
90  try {
91  for (chr=0; i<len; ++chr) {
92  i += byte_size_from_utf8_first(str[i]);
93  }
94  } catch(const invalid_utf8_exception&) {
95  ERR_GENERAL << "Invalid UTF-8 string.";
96  }
97  return chr;
98 }
99 
100 std::string& insert(std::string& str, const std::size_t pos, const std::string& insert)
101 {
102  return str.insert(index(str, pos), insert);
103 }
104 
105 std::string& erase(std::string& str, const std::size_t start, const std::size_t len)
106 {
107  if (start > size(str)) return str;
108  unsigned pos = index(str, start);
109 
110  if (len == std::string::npos) {
111  // without second argument, std::string::erase truncates
112  return str.erase(pos);
113  } else {
114  return str.erase(pos, index(str,start+len) - pos);
115  }
116 }
117 
118 std::string& truncate(std::string& str, const std::size_t size)
119 {
120  return erase(str, size);
121 }
122 
123 void truncate_as_ucs4(std::string &str, const std::size_t size)
124 {
125  std::u32string u4_str = unicode_cast<std::u32string>(str);
126  if(u4_str.size() > size) {
127  u4_str.resize(size);
128  str = unicode_cast<std::string>(u4_str);
129  }
130 }
131 
132 } // end namespace utf8
static iterator_base end(const string_type &str)
const std::pair< typename string_type::const_iterator, typename string_type::const_iterator > & substr() const
Thrown by operations encountering invalid UTF-8 data.
std::size_t i
Definition: function.cpp:968
Standard logging facilities (interface).
constexpr unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit ...
Definition: math.hpp:179
EXIT_STATUS start(bool clear_id, const std::string &filename, bool take_screenshot, const std::string &screenshot_filename)
Main interface for launching the editor from the title screen.
Functions for converting Unicode wide-char strings to UTF-8 encoded strings, back and forth.
Definition: unicode.cpp:35
std::size_t index(const std::string &str, const std::size_t index)
Byte offset (index into the std::string) at which the nth character of a UTF-8 string starts.
Definition: unicode.cpp:72
std::string & insert(std::string &str, const std::size_t pos, const std::string &insert)
Insert a UTF-8 string at the specified position.
Definition: unicode.cpp:100
std::string lowercase(const std::string &s)
Returns a lowercased version of the string.
Definition: unicode.cpp:52
std::string & erase(std::string &str, const std::size_t start, const std::size_t len)
Erases a portion of a UTF-8 string.
Definition: unicode.cpp:105
void truncate_as_ucs4(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:123
std::size_t size(const std::string &str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:87
static int byte_size_from_utf8_first(const unsigned char ch)
Definition: unicode.cpp:37
std::string & truncate(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:118
static map_location::DIRECTION s
#define ERR_GENERAL
Definition: unicode.cpp:33
static lg::log_domain log_engine("engine")