unit uCode; interface function gfniCodePageGet(pBuff: PAnsiChar; iCount: Integer): Integer; implementation uses Windows, ActiveX; //------------------------------------------------------------------------------ //文字コードの自動判定 //http://mrxray.on.coocan.jp/Delphi/plSamples/886_ChangeCodePage.htm //http://www.delphikingdom.com/asp/answer.asp?IDAnswer=16895 //https://groups.google.com/group/delphicbuilder-ml-archive/msg/9c9767ff7e738d3a?hl=ja& const IID_IMultiLanguage2 : TGUID = '{DCCFC164-2B38-11D2-B7EC-00C04F8F5D9A}'; CLASS_CMultiLanguage : TGUID = '{275C23E2-3747-11D0-9FEA-00AA003F8646}'; type tagMIMECPINFO = packed record dwFlags : LongWord; uiCodePage : SYSUINT; uiFamilyCodePage : SYSUINT; wszDescription : array[0..63] of Word; wszWebCharset : array[0..49] of Word; wszHeaderCharset : array[0..49] of Word; wszBodyCharset : array[0..49] of Word; wszFixedWidthFont : array[0..31] of Word; wszProportionalFont : array[0..31] of Word; bGDICharset : Byte; end; tagMIMECSETINFO = packed record uiCodePage : SYSUINT; uiInternetEncoding : SYSUINT; wszCharset : array[0..49] of Word; end; tagRFC1766INFO = packed record lcid : LongWord; wszRfc1766 : array[0..5] of Word; wszLocaleName : array[0..31] of Word; end; tagDetectEncodingInfo = packed record nLangID : SYSUINT; nCodePage : SYSUINT; nDocPercent : SYSINT; nConfidence : SYSINT; end; tagSCRIPTINFO = packed record ScriptId : Byte; uiCodePage : SYSUINT; wszDescription : array[0..47] of Word; wszFixedWidthFont : array[0..31] of Word; wszProportionalFont : array[0..31] of Word; end; __MIDL_IWinTypes_0009 = record case Integer of 0 : (hInproc : Integer); 1 : (hRemote : Integer); end; _RemotableHandle = packed record fContext : Integer; u : __MIDL_IWinTypes_0009; end; // Constants for enum tagMIMECONTF //type tagMIMECONTF = TOleEnum; // *********************************************************************// // Interface: IEnumCodePage // Flags: (0) // GUID: {275C23E3-3747-11D0-9FEA-00AA003F8646} // *********************************************************************// IEnumCodePage = interface(IUnknown) ['{275C23E3-3747-11D0-9FEA-00AA003F8646}'] function Clone(out ppEnum: IEnumCodePage): HResult; stdcall; function Next(celt: LongWord; out rgelt: tagMIMECPINFO; out pceltFetched: LongWord): HResult; stdcall; function Reset: HResult; stdcall; function Skip(celt: LongWord): HResult; stdcall; end; // *********************************************************************// // Interface: IEnumRfc1766 // Flags: (0) // GUID: {3DC39D1D-C030-11D0-B81B-00C04FC9B31F} // *********************************************************************// IEnumRfc1766 = interface(IUnknown) ['{3DC39D1D-C030-11D0-B81B-00C04FC9B31F}'] function Clone(out ppEnum: IEnumRfc1766): HResult; stdcall; function Next(celt: LongWord; out rgelt: tagRFC1766INFO; out pceltFetched: LongWord): HResult; stdcall; function Reset: HResult; stdcall; function Skip(celt: LongWord): HResult; stdcall; end; // *********************************************************************// // Interface: IMLangConvertCharset // Flags: (0) // GUID: {D66D6F98-CDAA-11D0-B822-00C04FC9B31F} // *********************************************************************// IMLangConvertCharset = interface(IUnknown) ['{D66D6F98-CDAA-11D0-B822-00C04FC9B31F}'] function Initialize(uiSrcCodePage: SYSUINT; uiDstCodePage: SYSUINT; dwProperty: LongWord): HResult; stdcall; function GetSourceCodePage(out puiSrcCodePage: SYSUINT): HResult; stdcall; function GetDestinationCodePage(out puiDstCodePage: SYSUINT): HResult; stdcall; function GetProperty(out pdwProperty: LongWord): HResult; stdcall; //***** // function DoConversion(var pSrcStr: Byte; var pcSrcSize: SYSUINT; var pDstStr: Byte; // var pcDstSize: SYSUINT): HResult; stdcall; function DoConversion( pSrcStr : PAnsiChar; var pcSrcSize : SYSUINT; pDstStr : PAnsiChar; var pcDstSize : SYSUINT ) : HResult; stdcall; function DoConversionToUnicode(var pSrcStr: Shortint; var pcSrcSize: SYSUINT; var pDstStr: Word; var pcDstSize: SYSUINT): HResult; stdcall; function DoConversionFromUnicode(var pSrcStr: Word; var pcSrcSize: SYSUINT; var pDstStr: Shortint; var pcDstSize: SYSUINT): HResult; stdcall; end; // *********************************************************************// // Interface: IEnumScript // Flags: (0) // GUID: {AE5F1430-388B-11D2-8380-00C04F8F5DA1} // *********************************************************************// IEnumScript = interface(IUnknown) ['{AE5F1430-388B-11D2-8380-00C04F8F5DA1}'] function Clone(out ppEnum: IEnumScript): HResult; stdcall; function Next(celt: LongWord; out rgelt: tagSCRIPTINFO; out pceltFetched: LongWord): HResult; stdcall; function Reset: HResult; stdcall; function Skip(celt: LongWord): HResult; stdcall; end; // *********************************************************************// // Interface: IMultiLanguage2 // Flags: (0) // GUID: {DCCFC164-2B38-11D2-B7EC-00C04F8F5D9A} // *********************************************************************// IMultiLanguage2 = interface(IUnknown) ['{DCCFC164-2B38-11D2-B7EC-00C04F8F5D9A}'] function GetNumberOfCodePageInfo(out pcCodePage: SYSUINT): HResult; stdcall; function GetCodePageInfo(uiCodePage: SYSUINT; LangId: Word; out pCodePageInfo: tagMIMECPINFO): HResult; stdcall; function GetFamilyCodePage(uiCodePage: SYSUINT; out puiFamilyCodePage: SYSUINT): HResult; stdcall; function EnumCodePages(grfFlags: LongWord; LangId: Word; out ppEnumCodePage: IEnumCodePage): HResult; stdcall; function GetCharsetInfo(const Charset: WideString; out pCharsetInfo: tagMIMECSETINFO): HResult; stdcall; function IsConvertible(dwSrcEncoding: LongWord; dwDstEncoding: LongWord): HResult; stdcall; //***** // function ConvertString(var pdwMode: LongWord; dwSrcEncoding: LongWord; dwDstEncoding: LongWord; // var pSrcStr: Byte; var pcSrcSize: SYSUINT; var pDstStr: Byte; // var pcDstSize: SYSUINT): HResult; stdcall; function ConvertString( var pdwMode : LongWord; dwSrcEncoding : LongWord; dwDstEncoding : LongWord; var pSrcStr : Byte; var pcSrcSize : SYSUINT; var pDstStr : Byte; var pcDstSize : SYSUINT ) : HResult; stdcall; //***** // function ConvertStringToUnicode(var pdwMode: LongWord; dwEncoding: LongWord; // var pSrcStr: Shortint; var pcSrcSize: SYSUINT; // var pDstStr: Word; var pcDstSize: SYSUINT): HResult; stdcall; function ConvertStringToUnicode( var pdwMode : DWORD; dwEncoding : DWORD; pSrcStr : PAnsiChar; var pcSrcSize : SYSUINT; pDstStr : PWideChar; var pcDstSize : SYSUINT ) : HResult; stdcall; //***** // function ConvertStringFromUnicode(var pdwMode: LongWord; dwEncoding: LongWord; // var pSrcStr: Word; var pcSrcSize: SYSUINT; // var pDstStr: Shortint; var pcDstSize: SYSUINT): HResult; stdcall; function ConvertStringFromUnicode( var pdwMode : DWORD; dwEncoding : DWORD; pSrcStr : PWideChar; var pcSrcSize : SYSUINT; pDstStr : PAnsiChar; var pcDstSize : SYSUINT ) : HResult; stdcall; function ConvertStringReset: HResult; stdcall; function GetRfc1766FromLcid(locale: LongWord; out pbstrRfc1766: WideString): HResult; stdcall; function GetLcidFromRfc1766(out plocale: LongWord; const bstrRfc1766: WideString): HResult; stdcall; function EnumRfc1766(LangId: Word; out ppEnumRfc1766: IEnumRfc1766): HResult; stdcall; function GetRfc1766Info(locale: LongWord; LangId: Word; out pRfc1766Info: tagRFC1766INFO): HResult; stdcall; function CreateConvertCharset(uiSrcCodePage: SYSUINT; uiDstCodePage: SYSUINT; dwProperty: LongWord; out ppMLangConvertCharset: IMLangConvertCharset): HResult; stdcall; function ConvertStringInIStream(var pdwMode: LongWord; dwFlag: LongWord; var lpFallBack: Word; dwSrcEncoding: LongWord; dwDstEncoding: LongWord; const pstmIn: ISequentialStream; const pstmOut: ISequentialStream): HResult; stdcall; function ConvertStringToUnicodeEx(var pdwMode: LongWord; dwEncoding: LongWord; var pSrcStr: Shortint; var pcSrcSize: SYSUINT; var pDstStr: Word; var pcDstSize: SYSUINT; dwFlag: LongWord; var lpFallBack: Word): HResult; stdcall; function ConvertStringFromUnicodeEx(var pdwMode: LongWord; dwEncoding: LongWord; var pSrcStr: Word; var pcSrcSize: SYSUINT; var pDstStr: Shortint; var pcDstSize: SYSUINT; dwFlag: LongWord; var lpFallBack: Word): HResult; stdcall; function DetectCodepageInIStream(dwFlag: LongWord; dwPrefWinCodePage: LongWord; const pstmIn: ISequentialStream; var lpEncoding: tagDetectEncodingInfo; var pnScores: SYSINT): HResult; stdcall; //***** // function DetectInputCodepage(dwFlag: LongWord; dwPrefWinCodePage: LongWord; // var pSrcStr: Shortint; var pcSrcSize: SYSINT; // var lpEncoding: tagDetectEncodingInfo; var pnScores: SYSINT): HResult; stdcall; function DetectInputCodepage( dwFlag : DWORD; dwPrefWinCodePage : DWORD; pSrcStr : PAnsiChar; var pcSrcSize : SYSINT; var lpEncoding : tagDetectEncodingInfo; var pnScores : SYSINT ) : HResult; stdcall; function ValidateCodePage(uiCodePage: SYSUINT; var hwnd: _RemotableHandle): HResult; stdcall; function GetCodePageDescription(uiCodePage: SYSUINT; lcid: LongWord; lpWideCharStr: PWideChar; cchWideChar: SYSINT): HResult; stdcall; function IsCodePageInstallable(uiCodePage: SYSUINT): HResult; stdcall; function SetMimeDBSource(dwSource: tagMIMECONTF): HResult; stdcall; function GetNumberOfScripts(out pnScripts: SYSUINT): HResult; stdcall; function EnumScripts(dwFlags: LongWord; LangId: Word; out ppEnumScript: IEnumScript): HResult; stdcall; function ValidateCodePageEx(uiCodePage: SYSUINT; var hwnd: _RemotableHandle; dwfIODControl: LongWord): HResult; stdcall; end; (* function gfnCodePageToCharCode(iCodePage: Integer): TMyCharCode; //http://www.xmleditor.jp/blog/archives/40 {2010-10-02: コードページからそれに対応する文字コードを返す } begin case iCodePage of 932: Result := cdSJis; //日本語 (シフト JIS) shift_jis 1200: Result := cdUTF_16LE; //Unicode utf-16 1201: Result := cdUTF_16BE; //Unicode (Big-Endian) unicodeFFFE 10001: Result := cdSJis; //日本語 (Mac) x-mac-japanese 20127: Result := cdSJis; //US-ASCII us-ascii 50220: Result := cdJIS; //日本語 (JIS) iso-2022-jp 50221: Result := cdJIS; //日本語 (JIS 1 バイト カタカナ可) csISO2022JP 50222: Result := cdJIS; //日本語 (JIS 1 バイト カタカナ可 - SO/SI) iso-2022-jp 50225: Result := cdJIS; //韓国語 (ISO) iso-2022-kr 50227: Result := cdJIS; //簡体字中国語 (ISO-2022) x-cp50227 20932: Result := cdEUC; //日本語 (JIS 0208-1990 および 0212-1990) EUC-JP 51932: Result := cdEUC; //日本語 (EUC) euc-jp 51936: Result := cdEUC; //簡体字中国語 (EUC) EUC-CN 51949: Result := cdEUC; //韓国語 (EUC) euc-kr 65000: Result := cdUTF_7; 65001: Result := cdUTF_8N; else Result := cdSJis; end; end; *) //function gfnCharCodeGet(pBuff: PAnsiChar; iCount: Integer): TMyCharCode; function gfniCodePageGet(pBuff: PAnsiChar; iCount: Integer): Integer; var l_IMultiLanguage : IMultiLanguage2; l_Encoding : tagDetectEncodingInfo; li_Source : SYSINT; lb_LE : Boolean; lb_BE : Boolean; i : Integer; begin Result := 1252; if (Succeeded(CoCreateInstance( CLASS_CMultiLanguage, //オブジェクトのCLSID nil, //複数オブジェクトの一部の時はIUnknownインターフェイスのポインタ CLSCTX_INPROC_SERVER, //管理コードを実行するコンテキスト IID_IMultiLanguage2, //取得するIID l_IMultiLanguage //生成オブジェクトの変数アドレス ))) then begin li_Source := 1; if (Succeeded(l_IMultiLanguage.DetectInputCodepage(0, 0, pBuff, iCount, l_Encoding, li_Source))) then begin //BOMなしのUTF-16は正しく判定できないようだ。 Result := l_Encoding.nCodePage; end; l_IMultiLanguage := nil; end; if (Result = 1252) then begin lb_LE := False; lb_BE := False; for i := 0 to iCount-1 do begin if (pBuff[i] = #0) then begin //#0があったのでUTF-16とする。 //半角の英数記号の上位バイトは0x00なので#0があればUTF-16と判定するというかなり適当な処理。 if (Odd(i)) then begin //リトルエンデアン if (lb_BE) then begin //既にビッグエンディアンとして推定していた→UTF-16ではない(おそらくバイナリデータ) lb_BE := False; Break; end; lb_LE := True; end else begin //ビッグエンディアン if (lb_LE) then begin //既にリトルエンディアンとして推定していた→UTF-16ではない(おそらくバイナリデータ) lb_LE := False; Break; end; lb_BE := True; end; end; end; if (lb_LE or lb_BE) then begin if (lb_LE) then begin Result := 1200; end else if (lb_BE) then begin Result := 1201; end; end; end; end; end.