Форум по Delphi программированию

Delphi Sources



Вернуться   Форум по Delphi программированию > Все о Delphi > [ "Начинающим" ]
Ник
Пароль
Регистрация <<         Правила форума         >> FAQ Пользователи Календарь Поиск Сообщения за сегодня Все разделы прочитаны

Ответ
 
Опции темы Поиск в этой теме Опции просмотра
  #1  
Старый 01.03.2007, 13:49
Аватар для 4kusNick
4kusNick 4kusNick вне форума
Местный
 
Регистрация: 06.09.2006
Адрес: Россия, Санкт-Петербург
Сообщения: 444
Репутация: 550
Стрелка

Так:
Код:
type
  { Code pages handled by GetMap / DetermineRussian. }
  TCodePage = (cpWin1251, cp866, cpKOI8R);
  { Pointer to a recoding table; nil is used to mean "no recoding needed". }
  PMap = ^TMap;
  { Recoding table indexed by the upper half of the 8-bit range (#$80..#$FF);
    each entry holds the replacement character for that source byte. }
  TMap = array[#$80..#$FF] of Char;

const
  { Placeholder for source bytes that are not Russian letters.  A plain space
    is used so that callers which classify characters by letter range treat
    such bytes as word separators. }
  { cp866 (DOS) -> Windows-1251.  Only the Russian alphabet and the letter Yo
    are translated; pseudographics and other symbols become spaces. }
  Cp866ToWin1251: TMap = (
    {80} #$C0, #$C1, #$C2, #$C3, #$C4, #$C5, #$C6, #$C7,
         #$C8, #$C9, #$CA, #$CB, #$CC, #$CD, #$CE, #$CF,
    {90} #$D0, #$D1, #$D2, #$D3, #$D4, #$D5, #$D6, #$D7,
         #$D8, #$D9, #$DA, #$DB, #$DC, #$DD, #$DE, #$DF,
    {A0} #$E0, #$E1, #$E2, #$E3, #$E4, #$E5, #$E6, #$E7,
         #$E8, #$E9, #$EA, #$EB, #$EC, #$ED, #$EE, #$EF,
    {B0} #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
         #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
    {C0} #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
         #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
    {D0} #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
         #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
    {E0} #$F0, #$F1, #$F2, #$F3, #$F4, #$F5, #$F6, #$F7,
         #$F8, #$F9, #$FA, #$FB, #$FC, #$FD, #$FE, #$FF,
    {F0} #$A8, #$B8, #$20, #$20, #$20, #$20, #$20, #$20,
         #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20);

  { KOI8-R -> Windows-1251.  KOI8-R keeps lower-case letters in #$C0..#$DF and
    upper-case letters in #$E0..#$FF, both in the same non-alphabetical
    order; Yo is at #$A3 (lower) and #$B3 (upper). }
  Koi8RToWin1251: TMap = (
    {80} #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
         #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
    {90} #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
         #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
    {A0} #$20, #$20, #$20, #$B8, #$20, #$20, #$20, #$20,
         #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
    {B0} #$20, #$20, #$20, #$A8, #$20, #$20, #$20, #$20,
         #$20, #$20, #$20, #$20, #$20, #$20, #$20, #$20,
    {C0} #$FE, #$E0, #$E1, #$F6, #$E4, #$E5, #$F4, #$E3,
         #$F5, #$E8, #$E9, #$EA, #$EB, #$EC, #$ED, #$EE,
    {D0} #$EF, #$FF, #$F0, #$F1, #$F2, #$F3, #$E6, #$E2,
         #$FC, #$FB, #$E7, #$F8, #$FD, #$F9, #$F7, #$FA,
    {E0} #$DE, #$C0, #$C1, #$D6, #$C4, #$C5, #$D4, #$C3,
         #$D5, #$C8, #$C9, #$CA, #$CB, #$CC, #$CD, #$CE,
    {F0} #$CF, #$DF, #$D0, #$D1, #$D2, #$D3, #$C6, #$C2,
         #$DC, #$DB, #$C7, #$D8, #$DD, #$D9, #$D7, #$DA);

function GetMap(CP: TCodePage): PMap;
{ Returns a pointer to the table that recodes bytes #$80..#$FF of code page CP
  into Windows-1251, or nil when CP is cpWin1251 (no recoding needed).
  The returned tables are constants and must not be modified by the caller.
  NOTE: the tables assume an 8-bit Char (pre-Unicode Delphi / Turbo Pascal
  style), which is what TMap's declaration implies. }
begin
  case CP of
    cp866:
      GetMap := @Cp866ToWin1251;
    cpKOI8R:
      GetMap := @Koi8RToWin1251;
  else
    GetMap := nil; { cpWin1251: the text is already in the target encoding }
  end;
end;

function DetermineRussian(Buf: PChar; Count: Integer): TCodePage;
{ Guesses which code page a buffer of 8-bit Russian text is written in.

  Buf   - pointer to the text (need not be zero-terminated).
  Count - number of bytes of Buf to examine.

  For every candidate code page the buffer is recoded to Windows-1251 through
  GetMap, the frequencies of adjacent character pairs (bigrams) are counted,
  normalised to 0..255 and compared with the reference table ModelBigrams.
  The code page with the smallest sum of absolute differences is returned.
  cpWin1251 is returned for nil or empty input.

  Assumes an 8-bit Char. }
const
  SepIndex = 32;   { row/column used for any non-letter ("word separator") }
  YoIndex = 33;    { row/column used for the letter Yo, upper or lower case }
  YoUpper = #$A8;  { Windows-1251 code of upper-case Yo }
  YoLower = #$B8;  { Windows-1251 code of lower-case Yo }
  FirstUpper = #$C0; { Windows-1251 code of the first upper-case letter }
  LastUpper = #$DF;  { Windows-1251 code of the last upper-case letter }
  { Reference bigram frequencies, scaled to 0..255.  Row = first character,
    column = second character.  Indices 0..31 are the 32 letters in
    Windows-1251 order, 32 is "non-letter", 33 is Yo. }
  ModelBigrams: array[0..33, 0..33] of Byte = (
    {АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ_Ё}
    {А}(0, 20, 44, 12, 22, 23, 16, 60, 4, 9, 63, 93, 47, 110, 0, 16, 35, 61, 81,
      1, 5, 13, 24, 17, 12, 4, 0, 0, 0, 0, 14, 31, 205, 1),
    {Б}(19, 0, 0, 0, 4, 19, 0, 0, 8, 0, 2, 15, 1, 4, 41, 0, 15, 5, 0, 15, 0, 2,
      1, 0, 0, 6, 16, 37, 0, 0, 0, 4, 3, 0),
    {В}(97, 0, 1, 0, 2, 57, 0, 5, 40, 0, 4, 25, 2, 23, 78, 2, 8, 28, 4, 12, 0,
      1, 0, 0, 8, 1, 0, 40, 1, 0, 0, 5, 106, 3),
    {Г}(13, 0, 0, 0, 9, 5, 0, 0, 15, 0, 1, 17, 1, 2, 96, 0, 24, 0, 0, 7, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0),
    {Д}(63, 0, 9, 1, 2, 71, 1, 0, 35, 0, 3, 16, 2, 22, 50, 2, 19, 9, 2, 25, 0,
      2, 1, 0, 1, 0, 1, 9, 4, 0, 1, 5, 17, 4),
    {Е}(4, 14, 15, 34, 56, 22, 13, 14, 2, 34, 39, 77, 73, 150, 6, 9, 101, 64,
      81, 1, 0, 15, 5, 12, 10, 6, 0, 0, 0, 0, 3, 4, 235, 1),
    {Ж}(13, 0, 0, 0, 12, 47, 0, 0, 16, 0, 1, 0, 0, 23, 0, 0, 0, 0, 0, 3, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2),
    {З}(76, 2, 11, 3, 11, 4, 1, 0, 7, 0, 2, 4, 11, 24, 17, 0, 6, 1, 0, 8, 0, 0,
      0, 0, 0, 0, 0, 16, 6, 0, 1, 4, 17, 0),
    {И}(7, 9, 32, 5, 18, 60, 4, 42, 31, 27, 28, 46, 55, 49, 12, 7, 26, 60, 53,
      0, 5, 25, 14, 28, 4, 1, 0, 0, 0, 0, 9, 56, 255, 0),
    {Й}(0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 3, 0, 3, 0, 0, 0, 10, 3, 0, 0, 0, 0, 1,
      1, 0, 0, 0, 0, 0, 0, 0, 122, 0),
    { NOTE(review): the published listing had 35 values in this row, ending
      "5, 5, 0", which cannot compile.  It is restored here as "55, 0" (a split
      number): the Yo column of other rows is about 1/20 of the E column (7 here,
      so 0), and a word-final K is frequent.  Confirm against the original. }
    {К}(92, 0, 3, 0, 0, 7, 2, 1, 39, 0, 0, 27, 0, 14, 110, 0, 18, 5, 35, 18, 0,
      0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0),
    {Л}(85, 1, 0, 2, 1, 70, 6, 0, 85, 0, 5, 3, 0, 9, 67, 1, 0, 9, 0, 15, 0, 0,
      0, 2, 0, 0, 0, 9, 66, 0, 15, 43, 57, 4),
    {М}(44, 0, 0, 0, 0, 65, 0, 0, 47, 0, 1, 1, 10, 15, 57, 7, 0, 2, 0, 24, 0, 0,
      0, 0, 0, 0, 0, 28, 0, 0, 0, 8, 109, 3),
    {Н}(139, 0, 0, 1, 11, 108, 0, 4, 152, 0, 7, 0, 1, 69, 161, 0, 0, 8, 25, 24,
      5, 1, 5, 2, 0, 1, 0, 83, 10, 0, 1, 29, 38, 5),
    {О}(0, 72, 139, 76, 74, 32, 32, 19, 12, 52, 21, 93, 68, 72, 7, 34, 93, 102,
      98, 1, 2, 6, 6, 19, 15, 2, 0, 0, 0, 1, 4, 9, 252, 2),
    {П}(17, 0, 0, 0, 0, 43, 0, 0, 14, 0, 1, 9, 0, 1, 125, 3, 120, 1, 2, 8, 0, 0,
      0, 0, 0, 0, 0, 3, 6, 0, 0, 3, 2, 2),
    {Р}(151, 1, 6, 4, 3, 103, 7, 0, 76, 0, 4, 0, 11, 10, 117, 1, 0, 5, 9, 39, 2,
      5, 0, 1, 3, 0, 0, 24, 7, 0, 1, 10, 22, 5),
    {С}(24, 1, 21, 0, 3, 39, 0, 0, 33, 0, 56, 41, 11, 15, 58, 30, 5, 30, 183,
      16, 0, 4, 1, 4, 1, 0, 0, 8, 25, 0, 1, 50, 41, 2),
    {Т}(83, 0, 43, 0, 3, 87, 0, 0, 71, 0, 9, 3, 2, 26, 180, 0, 55, 33, 1, 23, 1,
      0, 1, 4, 0, 0, 0, 20, 78, 0, 0, 5, 82, 4),
    {У}(3, 6, 7, 14, 19, 8, 13, 6, 0, 1, 13, 15, 10, 7, 0, 12, 17, 16, 19, 0, 1,
      3, 0, 12, 5, 8, 0, 0, 0, 0, 22, 1, 65, 0),
    {Ф}(4, 0, 0, 0, 0, 4, 0, 0, 11, 0, 0, 1, 0, 0, 9, 0, 3, 0, 0, 4, 1, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 2, 0),
    {Х}(9, 0, 2, 0, 0, 2, 0, 0, 5, 0, 0, 1, 0, 5, 26, 0, 4, 1, 0, 1, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 76, 0),
    {Ц}(5, 0, 0, 0, 0, 16, 0, 0, 48, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 3, 0, 0, 0,
      0, 0, 0, 0, 2, 0, 0, 0, 0, 3, 0),
    {Ч}(30, 0, 0, 0, 0, 52, 0, 0, 23, 0, 3, 1, 0, 14, 1, 0, 0, 0, 36, 5, 0, 0,
      0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 2),
    {Ш}(13, 0, 0, 0, 0, 28, 0, 0, 17, 0, 4, 4, 0, 4, 3, 0, 0, 0, 1, 3, 0, 0, 0,
      0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 1),
    {Щ}(6, 0, 0, 0, 0, 23, 0, 0, 16, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
      0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1),
    {Ъ}(0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 1, 1, 0, 0),
    {Ы}(0, 5, 14, 1, 3, 28, 0, 2, 0, 22, 6, 19, 21, 2, 0, 5, 4, 7, 10, 0, 0, 37,
      0, 3, 4, 0, 0, 0, 0, 0, 0, 1, 84, 0),
    {Ь}(0, 1, 0, 0, 0, 9, 0, 10, 1, 0, 13, 0, 2, 26, 0, 0, 0, 10, 3, 0, 0, 0, 1,
      0, 6, 0, 0, 0, 0, 0, 6, 4, 117, 0),
    {Э}(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 31, 0, 1, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    {Ю}(0, 5, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 15, 0, 0, 0, 1, 4,
      1, 15, 0, 0, 0, 0, 0, 0, 38, 0),
    {Я}(0, 0, 9, 2, 7, 10, 3, 19, 0, 0, 1, 6, 7, 8, 0, 0, 2, 6, 19, 0, 0, 3, 5,
      1, 0, 3, 0, 0, 0, 0, 5, 2, 177, 0),
    {_}(42, 80, 193, 43, 109, 41, 18, 53, 159, 0, 144, 27, 83, 176, 187, 229,
      70, 231, 99, 47, 15, 13, 6, 58, 7, 0, 0, 0, 0, 38, 0, 22, 0, 2),
    {Ё}(0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 4, 4, 8, 0, 0, 5, 3, 4, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
  { The "rating" of Yo is conventionally taken as 1/20 of the rating of the
    letter E when the combination involving Yo is valid, otherwise 0. }
type
  TVariation = array[0..33, 0..33] of Integer;
var
  I, J, iC, iPredC, MaxCount: Integer;
  C: Char;
  CP: TCodePage;
  D, MinD, Factor: Double;
  HaveBest: Boolean;
  AMap: PMap;
  PV: ^TVariation;
  Vars: array[TCodePage] of TVariation;
begin
  DetermineRussian := cpWin1251; { default answer }
  if (Buf = nil) or (Count <= 0) then
    Exit; { nothing to analyse }

  { Pass 1: count bigrams of the buffer as seen through each code page. }
  FillChar(Vars, SizeOf(Vars), 0);
  for CP := Low(TCodePage) to High(TCodePage) do
  begin
    AMap := GetMap(CP);
    PV := @Vars[CP];
    iPredC := SepIndex; { the text is treated as if preceded by a separator }
    for I := 0 to Count - 1 do
    begin
      C := Buf[I];
      iC := SepIndex; { 7-bit characters are always separators }
      if C >= #$80 then
      begin
        if AMap <> nil then
          C := AMap^[C]; { recode to Windows-1251 }
        if (C = YoUpper) or (C = YoLower) then
          iC := YoIndex
        else
        begin
          { Clearing bit 5 folds Windows-1251 lower case (#$E0..#$FF) onto
            upper case (#$C0..#$DF). }
          C := Chr(Ord(C) and not 32);
          if (C >= FirstUpper) and (C <= LastUpper) then
            iC := Ord(C) - Ord(FirstUpper);
        end;
      end;
      Inc(PV^[iPredC, iC]);
      iPredC := iC;
    end;
  end;

  { Pass 2: score each distribution against the model, keep the closest. }
  HaveBest := False;
  MinD := 0;
  for CP := Low(TCodePage) to High(TCodePage) do
  begin
    PV := @Vars[CP];
    PV^[SepIndex, SepIndex] := 0; { separator-separator pairs carry no signal }
    MaxCount := 1; { starting at 1 also avoids division by zero }
    for I := 0 to 33 do
      for J := 0 to 33 do
        if PV^[I, J] > MaxCount then
          MaxCount := PV^[I, J];
    Factor := 255 / MaxCount; { scale counts to the 0..255 range of the model }
    D := 0;
    for I := 0 to 33 do
      for J := 0 to 33 do
        D := D + Abs(PV^[I, J] * Factor - ModelBigrams[I, J]);
    { Strict '<' keeps the earliest code page on ties. }
    if (not HaveBest) or (D < MinD) then
    begin
      HaveBest := True;
      MinD := D;
      DetermineRussian := CP;
    end;
  end;
end;

const
  { The Russian word for "example" (six letters) encoded in each of the
    three supported code pages. }
  SampleLen = 6;
  SampleWin1251 = #$CF#$F0#$E8#$EC#$E5#$F0;
  SampleCp866 = #$8F#$E0#$A8#$AC#$A5#$E0;
  SampleKoi8R = #$F0#$D2#$C9#$CD#$C5#$D2;

begin
  { Self-test: each line prints whether the sample was recognised as the
    expected code page.  The probability of a wrong guess on texts this short
    is high - success here is largely luck. }
  WriteLn(DetermineRussian(SampleWin1251, SampleLen) = cpWin1251);
  WriteLn(DetermineRussian(SampleCp866, SampleLen) = cp866);
  WriteLn(DetermineRussian(SampleKoi8R, SampleLen) = cpKOI8R);
  ReadLn; { wait for Enter before exiting }
end.

Или так:

Код:
const
  l3_csANSI = 0;
  { Result code meaning "ANSI encoding". }
  l3_csOEM = 255;
  { Result code meaning "OEM encoding". }

type
  cc_Graph_CriteriaRange = #176..#223; { byte range used to recognise pseudographics (box-drawing characters) }
  TChars = set of char;
  Long = LongInt;

const
  { Bytes counted as evidence of OEM text by l3AnalizeCharSetEx
    (presumably the cp866 Cyrillic letter ranges - confirm). }
  cc_OEM_CriteriaEx = [#128..#175] + [#224..#239];
  { Bytes counted as evidence of ANSI text by l3AnalizeCharSetEx
    (presumably the Windows-1251 Cyrillic letter range - confirm). }
  cc_ANSI_CriteriaEx = [#192..#255];
  { The pseudographics range above, as a set. }
  cc_Graph_Criteria = [Low(cc_Graph_CriteriaRange)..High(cc_Graph_CriteriaRange)];

type
  { One counter per byte value in the pseudographics range. }
  T_cc_GraphCounts = array [cc_Graph_CriteriaRange] of Longint;

procedure l3AnalizeCharSetEx(var Buf: PChar; BufEnd: PChar;
  var OEMCount, ANSICount, GraphCount: Long;
  var GraphCounts: T_cc_GraphCounts);
{ Scans the bytes in [Buf, BufEnd) and gathers the statistics used to tell
  OEM text from ANSI text.

  Buf         - start of the data; on return it has been advanced to BufEnd
                (or is left unchanged if it was already >= BufEnd).
  BufEnd      - one past the last byte to examine.
  OEMCount    - out: number of bytes belonging to cc_OEM_CriteriaEx.
  ANSICount   - out: number of bytes belonging to cc_ANSI_CriteriaEx.
  GraphCount  - out: number of bytes belonging to cc_Graph_Criteria.
  GraphCounts - out: per-byte-value breakdown of GraphCount.

  The three sets overlap, so a single byte may bump several counters. }
var
  Ch: Char;
begin
  { Reset every output before scanning. }
  FillChar(GraphCounts, SizeOf(GraphCounts), 0);
  GraphCount := 0;
  ANSICount := 0;
  OEMCount := 0;

  while Buf < BufEnd do
  begin
    Ch := Buf^;
    Inc(Buf);

    if Ch in cc_OEM_CriteriaEx then
      Inc(OEMCount);

    if Ch in cc_ANSI_CriteriaEx then
      Inc(ANSICount);

    if Ch in cc_Graph_Criteria then
    begin
      Inc(GraphCount);
      Inc(GraphCounts[Ch]);
    end;
  end;
end;

function l3AnalizeCharSetExEx(Buf, BufEnd: PChar): Byte;
{ Classifies the bytes in [Buf, BufEnd) as OEM or ANSI text.

  Returns l3_csOEM or l3_csANSI.  Decision rules, in order:
    1. More OEM-range bytes than ANSI-range bytes          -> OEM.
    2. Otherwise, if pseudographics-range bytes are at least as numerous as
       ANSI-range bytes AND a single byte value accounts for more than half
       of them (typical for runs of one box-drawing character) -> OEM.
    3. Anything else (including an empty buffer)           -> ANSI. }
var
  OEMCount: Long;
  ANSICount: Long;
  GraphCount: Long;
  GraphCount_2: Long;
  GraphCounts: T_cc_GraphCounts;
  C: Char;
begin
  l3AnalizeCharSetEx(Buf, BufEnd, OEMCount, ANSICount, GraphCount, GraphCounts);

  Result := l3_csANSI;
  if OEMCount > ANSICount then
    Result := l3_csOEM
  else if GraphCount >= ANSICount then
  begin
    { The bytes #176..#223 are ambiguous (pseudographics in OEM, mostly
      letters in ANSI); treat them as pseudographics only when one value
      dominates. }
    GraphCount_2 := GraphCount div 2;
    for C := Low(T_cc_GraphCounts) to High(T_cc_GraphCounts) do
      if GraphCounts[C] > GraphCount_2 then
      begin
        Result := l3_csOEM;
        Break;
      end;
  end;
end;

function l3AnalizeCharSetBuf(Buf: PChar; Len: Long): Byte;
{ Convenience wrapper around l3AnalizeCharSetExEx for callers that have a
  start pointer and a length instead of a start/end pointer pair.
  Returns whatever l3AnalizeCharSetExEx returns for the Len bytes at Buf. }
var
  Stop: PChar;
begin
  Stop := Buf + Len; { one past the last byte to examine }
  Result := l3AnalizeCharSetExEx(Buf, Stop);
end;
__________________
THE CRACKER IS OUT THERE
Ответить с цитированием
Ответ


Delphi Sources

Опции темы Поиск в этой теме
Поиск в этой теме:

Расширенный поиск
Опции просмотра
Комбинированный вид Комбинированный вид

Ваши права в разделе
Вы не можете создавать темы
Вы не можете отвечать на сообщения
Вы не можете прикреплять файлы
Вы не можете редактировать сообщения

BB-коды Вкл.
Смайлы Вкл.
[IMG] код Вкл.
HTML код Выкл.
Быстрый переход


Часовой пояс GMT +3, время: 23:21.


 

Сайт

Форум

FAQ

RSS лента

Прочее

 

Copyright © Форум "Delphi Sources" by BrokenByte Software, 2004-2023

ВКонтакте   Facebook   Twitter