Question

我们有一个网站，我们使用ISO-8859-1，并希望转向UTF-8。它是用PHP制作的，过程很简单，文档很好。

在我们的案例中，由于我们在不同国家/地区设有此网站，因此我们只想在一个国家/地区进行尝试。我们这样做了很多次。我们遵循的流程非常简单：从主干（trunk）创建代码分支，并将该分支的代码部署到生产环境中。为了保持分支更新，我们只需将主干（trunk）上的更改合并到分支（branch），直到我们重新集成并关闭此功能分支。

我们想只在一个国家/地区对其进行测试，以便在出错时减少影响。

任何其他类型的更改都可以很好地工作，但在这种情况下，在分支转移到UTF-8之后，我将无法再把主干的更改合并到分支以使其保持最新。

我一直试图找到与此相关的内容但没有成功。

你知道有没有办法在不同的字符集之间进行合并？

非常感谢， Grego

Answer 1

我遇到了同样的问题，我通过以下方式解决了这个问题：

安装python3和chardet包（pip install chardet）。安装diff3工具，在Windows上你可以从MinGW获得它。

编辑svn配置文件（在Windows上位于 %APPDATA%\Subversion\config）

[helpers]
# note: write \\ instead of \ in the path below
# (comments must be on their own line; a trailing "# ..." would become part of the value)
diff3-cmd = C:\\diff3wrap.bat

C:\diff3wrap.bat

@echo off
rem Wrapper configured as svn's diff3-cmd: sets up the environment and
rem forwards every argument unchanged to diff3.py.
SETLOCAL ENABLEEXTENSIONS
rem Adjust these two directories to the local installation (no trailing backslash needed).
set pythondir=path\to\python3\dir
set mingwdir=path\to\diff3\dir

rem A backslash is required between the python directory and "lib";
rem without it the two names are glued together into a non-existent path.
set pythonpath=%pythondir%\lib\site-packages\;%pythonpath%
set path=%pythondir%;%mingwdir%;%path%
rem svn passes diff3-cmd the same arguments the diff3 util expects
rem e.g. -E -m -L .working -L .merge-left.r5 -L .merge-right.r6 path\to\temp\local\file path\to\temp\base\file path\to\temp\remote\file
python C:\diff3.py %*

C:\diff3.py

#!python3
import codecs
import sys
from subprocess import Popen, PIPE

from chardet.langcyrillicmodel import Ibm866Model, Win1251CyrillicModel
from chardet.sbcharsetprober import SingleByteCharSetProber
from chardet.universaldetector import UniversalDetector
from chardet.utf8prober import UTF8Prober

# Single shared detector; detect_encoding() resets it before each file.
detector = UniversalDetector()
# Keep only the probers that are actually needed so detection runs faster.
# `_mCharSetProbers` is the attribute name in older chardet releases;
# newer chardet calls it `_charset_probers`.
detector._mCharSetProbers = [
    UTF8Prober(),
    *(SingleByteCharSetProber(model)
      for model in (Ibm866Model, Win1251CyrillicModel)),
]


def detect_encoding(file_path):
    """Guess the character encoding of the file at *file_path*.

    The file is read as bytes and fed line by line to the module-level
    ``detector`` until the detector reports it is done or the file ends.

    Returns the encoding name reported by the detector, except that
    ``'ascii'`` is reported as ``'utf-8'``.

    NOTE(review): the detector presumably reports ``None`` when it cannot
    decide (e.g. for an empty file) -- confirm against the chardet docs.
    """
    detector.reset()
    # Use a context manager so the handle is closed even when reading stops
    # early or the detector raises; main() rewrites these files right after
    # detection, so a lingering handle should not be left behind.
    with open(file_path, 'rb') as source:
        for line in source:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result["encoding"]
    # treat ascii files as utf-8
    return 'utf-8' if encoding == 'ascii' else encoding


def iconv(file_path, from_encoding, to_encoding):
    """Re-encode the file at *file_path* in place.

    The whole file is decoded using *from_encoding* and written back through
    ``write_to_file`` using *to_encoding*. When both encoding names are equal
    the file is left untouched.
    """
    if from_encoding != to_encoding:
        with codecs.open(file_path, 'r', from_encoding) as source:
            decoded = source.read()
        write_to_file(file_path, decoded, to_encoding)


def write_to_file(file_path, text, to_encoding):
    """Overwrite the file at *file_path* with *text* encoded as *to_encoding*."""
    # The file is opened in binary mode without a codec; the encoding of
    # *text* is done by write_bytes_to_stream.
    with codecs.open(file_path, 'bw') as binary_out:
        write_bytes_to_stream(binary_out, text, to_encoding)


def write_bytes_to_stream(stream, text, to_encoding):
    """Encode *text* as *to_encoding* and write the bytes to *stream*.

    For the exact names ``"UTF-16LE"`` and ``"UTF-16BE"`` the matching byte
    order mark is written first. Characters that cannot be represented in
    the target encoding are dropped (``'ignore'`` error handler).
    """
    # A BOM is only emitted for these encodings; add it by hand elsewhere
    # if other files need one.
    bom_by_encoding = {
        "UTF-16LE": codecs.BOM_UTF16_LE,
        "UTF-16BE": codecs.BOM_UTF16_BE,
    }
    bom = bom_by_encoding.get(to_encoding)
    if bom is not None:
        stream.write(bom)
    payload = text.encode(to_encoding, 'ignore')
    stream.write(payload)


def main():
    """Act as svn's diff3 command for files whose encodings may differ.

    The last three command-line arguments are taken as the local, base and
    remote temp files. All three are converted to UTF-8, the real ``diff3``
    is run with the remaining arguments, and its output is written to stdout
    in the local file's encoding. The process exits with diff3's return code.
    """
    # TortoiseSVN's 'merge' button in the commit dialog adds some arguments
    # that the diff3 tool doesn't know, so strip them before forwarding.
    for unsupported in ['--ignore-eol-style', '-w']:
        if unsupported in sys.argv:
            sys.argv.remove(unsupported)

    # Expected argv layout:
    # ['diff3.py', '-E', '-m', '-L', '.working', '-L', '.merge-left.r5',  '-L', '.merge-right.r6',
    # 'local_path', 'base_path', 'remote_path']
    paths = [sys.argv[position] for position in (-3, -2, -1)]
    local_path, base_path, remote_path = paths
    encodings = [detect_encoding(path) for path in paths]
    local_encoding = encodings[0]

    # diff3 doesn't work with utf-16, so every file is converted to utf-8 first
    aux_encoding = 'utf-8'
    for path, encoding in zip(paths, encodings):
        iconv(path, encoding, aux_encoding)

    # Re-use the (filtered) argument list, swapping this script for diff3.
    sys.argv[0] = 'diff3'
    process = Popen(sys.argv, stdout=PIPE, stderr=sys.stderr)
    merged_bytes = process.communicate()[0]
    merged_text = merged_bytes.decode(aux_encoding)
    write_bytes_to_stream(sys.stdout.buffer, merged_text, local_encoding)

    # On conflict svn copies the temp base and temp remote files next to the
    # working-copy file, with names like your_file_with_conflict.merge-left.r5
    # and your_file_with_conflict.merge-right.r6, and a merge tool used to
    # resolve the conflict will read them. If their encoding differed from the
    # working file's, the working file would change encoding after conflict
    # resolution -- so convert the temp files to the local file's encoding.
    for temp_path in (base_path, remote_path):
        iconv(temp_path, aux_encoding, local_encoding)
    sys.exit(process.returncode)


# Run the merge wrapper only when this file is executed as a script.
if __name__ == '__main__':
    main()

SVN使用不同的字符集合并分支

1 个答案: