数据恢复—Office 文档修复原理
上节介绍的“劳拉”文件格式,对修复损坏的Office文档有什么帮助呢?其实Office文档的修复过程, 与硬盘的FAT 文件系统的数据恢复过程十分相似。以修复一个损坏的 Word 文件为例,其过程如下表7-4 所示:
|
1. |
通过文件块头确定目录链根的开始块序号; |
|
2. |
提取文件根目录结构; |
|
3. |
通过目录链表定位要挽救的OLE 对象的位置; |
|
4. |
拷贝文件未损坏的OLE 对象; |
|
5. |
对得到的OLE 对象进行组合,重新构造一个新的Word 文档。 |
表7-4 Word 文档的修复过程
得到的未损坏的OLE 对象越多, 被修复的Word 文档就越理想。如果想自己编写文件修复程序,这里提供一个参考程序 ole.c,它能够给出很多有参考价值的信息。
|
|
程序:ole.c |
|
. |
编程语言: Borland C++ |
|
. |
功能:列出一个 Word 文件的内部目录结构 |
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <ctype.h>
#include <sys/types.h>
#include <assert.h>
/* Smaller of two values. Both arguments are evaluated twice, so do not pass
   expressions with side effects. */
#define MIN(a,b) ((a)<(b) ? (a) : (b))
/* Capacity of the root_list and sbd_list block-number arrays in main(), and
   therefore the longest block chain the program will follow. */
#define MAXBLOCKS 64
/* One directory entry ("pps") of the OLE file, as decoded by main() from a
   0x80-byte record of the root stream. */
struct pps_block
{
/* Entry name; main() stores every other byte of the record's name field. */
char name[64];
/* Name size read from record offset 0x40; 0 marks an unused entry. */
int nsize;
/* Type byte from record offset 0x42; indexes pps_type[], 5 is the root. */
char type;
/* Links decoded from record offsets 0x44, 0x48 and 0x4c; NULL when the
   stored index is -1. */
struct pps_block *previous;
struct pps_block *next;
struct pps_block *directory;
/* First block number of the entry's data, from record offset 0x74. */
long int start;
/* Data size in bytes, from record offset 0x78. */
long int size;
/* Nesting depth assigned by unravel(); main() presets it to -1. */
int level;
/* Position of this entry in main()'s pps_list array. */
int index;
};
typedef struct pps_block pps_entry;
/* Labels printed by unravel(), indexed by pps_entry.type: 1 is "DIR ",
   2 is "FILE", 5 is "ROOT"; the remaining slots are empty strings. */
char *pps_type[]={"","DIR ","FILE","","","ROOT"};
/* Routine prototypes: decoders that turn 2 or 4 raw bytes read from the
   file into integers. */
unsigned short int ShortInt(unsigned char* array);
unsigned long int LongInt(unsigned char* array);
/*
 * Decode a 16-bit unsigned integer stored least-significant byte first.
 *
 * array: pointer to at least 2 readable bytes.
 * Returns array[0] + 256 * array[1].
 *
 * The value is assembled with shifts rather than by writing into a union,
 * so the result does not depend on the byte order of the host or on the
 * INTEL macro being defined at build time.
 */
unsigned short int ShortInt(unsigned char* array)
{
    unsigned int low = array[0];
    unsigned int high = array[1];

    return (unsigned short int)((high << 8) | low);
}
/*
 * Decode a 32-bit unsigned integer stored least-significant byte first.
 *
 * array: pointer to at least 4 readable bytes.
 * Returns the value in the range 0 .. 0xFFFFFFFF, also when unsigned long
 * is wider than 32 bits (the upper bits are then zero). Callers that want
 * to recognise the on-disk markers -1 and -2 on such a platform must
 * compare against 0xFFFFFFFF and 0xFFFFFFFE.
 *
 * The value is assembled with shifts rather than by writing into a union,
 * so the result does not depend on the byte order of the host or on the
 * INTEL macro being defined at build time.
 */
unsigned long int LongInt(unsigned char* array)
{
    unsigned long int value = 0;
    int i;

    /* Walk from the most significant byte (last in memory) to the least. */
    for(i = 3; i >= 0; i--) {
        value = (value << 8) | (unsigned long int)array[i];
    }
    return value;
}
/*
 * Walk the directory tree in order (previous, node, directory, next),
 * printing one line per entry and recording each entry's nesting depth.
 *
 * pps_node: entry to start from; NULL and unused entries (nsize == 0) are
 *           ignored.
 * level:    depth to assign to pps_node and its previous/next siblings;
 *           children reached through `directory` get level+1.
 *
 * Every entry must have level == -1 before the walk starts (main() sets
 * this). An entry whose level is already set is treated as visited and
 * skipped, so a damaged file whose links form a loop cannot cause endless
 * recursion.
 */
void unravel(pps_entry *pps_node, int level)
{
    const char *label = "????";
    int type;

    if(pps_node == NULL || pps_node->nsize == 0) return;
    if(pps_node->level != -1) return; /* already visited: link loop */

    /* Mark before recursing so that a loop through `previous` is caught. */
    pps_node->level = level;
    unravel(pps_node->previous,level);

    /* The type byte comes straight from the file; only use it as an index
       when it falls inside the label table. */
    type = pps_node->type;
    if(type >= 0 && (size_t)type < sizeof pps_type / sizeof pps_type[0]) {
        label = pps_type[type];
    }
    printf("PPS %s: %*x: ->%s\n",label,level*3,(unsigned int)pps_node->index,pps_node->name);

    unravel(pps_node->directory,level+1);
    unravel(pps_node->next,level);
}
/*
 * Read the 32-bit field at `field` through LongInt() and return it as a
 * signed value, so that the on-disk markers 0xFFFFFFFF and 0xFFFFFFFE
 * compare equal to -1 and -2 whatever the width of long on the host.
 */
static long int ole_s32(char *field)
{
    unsigned long int raw = LongInt((unsigned char *)field) & 0xFFFFFFFFUL;

    if(raw & 0x80000000UL) return -(long int)(0xFFFFFFFFUL - raw) - 1L;
    return (long int)raw;
}

/* malloc() that reports failure and terminates instead of returning NULL. */
static void *ole_alloc(size_t bytes)
{
    void *mem = malloc(bytes);

    if(mem == NULL) {
        fprintf(stderr,"Out of memory\n");
        exit (12);
    }
    return mem;
}

/*
 * Read big block number `block` (512 bytes, stored after the 512-byte
 * header) from `fp` into `dest`. Returns 1 on success, 0 on any failure.
 */
static int ole_read_block(FILE *fp, long int block, char *dest)
{
    unsigned long int pos;

    if(block < 0) return 0;
    pos = 512UL * ((unsigned long int)block + 1UL);
    if(fseek(fp, (long int)pos, SEEK_SET) != 0) return 0;
    return fread(dest, 512, 1, fp) == 1;
}

/*
 * Follow a block chain through `depot`, which holds `entries` 4-byte block
 * numbers. list[0] must hold the first block on entry; the following
 * blocks are stored in list[1..]. Returns the number of usable blocks now
 * in `list` (at most MAXBLOCKS). The chain ends at the -2 marker; a block
 * number outside the depot ends it early with a warning.
 */
static int ole_follow_chain(char *depot, long int entries, long int *list, const char *label)
{
    int count = 0;

    while(count < MAXBLOCKS) {
        long int cur = list[count];

        if(cur == -2) return count;
        if(cur < 0 || cur >= entries) {
            fprintf(stderr,"Faulty %s block chain, keeping %d blocks\n",label,count);
            return count;
        }
        count++;
        if(count < MAXBLOCKS) list[count] = ole_s32(depot+(cur*4));
    }
    fprintf(stderr,"Help too many %s blocks\n",label);
    return count;
}

/*
 * ole <file> [name]
 *
 * Prints the directory tree of the OLE file <file>. When [name] is given,
 * the data of every entry with that name at nesting level 1 or less is
 * also written to the file "OLE.tmp".
 *
 * Exit status: 0 success, 5 damaged OLE structure, 8 input is plain text
 * or not an OLE file, 12 missing/unopenable input or out of memory.
 */
int main(int argc, char **argv)
{
    FILE *input = NULL;
    FILE *OLEfile = NULL;
    FILE *sbfile = NULL;
    FILE *infile = NULL;
    char Target[64];
    int debug = 0, BlockSize = 0, Offset = 0;
    int c, i, j, len, sbd_len, npps, bytes, name_bytes;
    char *s, *p, *t;
    char *Block, *BDepot, *SDepot = NULL, *Depot, *Root;
    unsigned long int FilePos=0x00000000;
    long int num_bbd_blocks, bbd_entries, depot_entries;
    long int root_list[MAXBLOCKS], sbd_list[MAXBLOCKS];
    long int pps_size, pps_start = -1;
    long int linkto;
    int root_entry = -1;
    pps_entry **pps_list;

    if(argc < 2) {
        fprintf(stderr,"No input file name\n");
        exit (12);
    }
    fprintf(stderr,"File given was %s\n",argv[1]);
    input = fopen(argv[1], "rb");
    if(input==NULL) {
        fprintf(stderr,"Error opening file %s\n",argv[1]);
        exit (12);
    }
    if(argc < 3) {
        fprintf(stderr,"Listing contents\n");
        /* A name no entry is expected to have, so nothing is extracted. */
        strcpy(Target,"UnLiKeLy");
    } else {
        /* strncpy does not terminate on truncation; do it explicitly. */
        strncpy(Target, argv[2], sizeof Target - 1);
        Target[sizeof Target - 1] = '\0';
        fprintf(stderr, "Extracting %s...\n", Target);
    }

    /* peek into file to guess file type */
    c = getc (input);
    ungetc(c,input);
    if(isprint(c)) {
        fprintf(stderr,"File looks like a plain text file.\n");
        fclose(input);
        return 8;
    }
    /* check for MS OLE wrapper */
    if(c != 0xd0) {
        /* not a OLE file! */
        fprintf(stderr,"7 =========> Input file is not an OLE file\n");
        exit (8);
    }

    Block = (char *)ole_alloc(512);
    /* read header block */
    if(fread(Block,512,1,input) != 1 ) {
        fprintf(stderr,"1 =========> Input file has faulty OLE format\n");
        exit (5);
    }
    num_bbd_blocks = ole_s32(Block+0x2c);
    /* The depot block numbers are read from header offset 0x4c onwards, so
       the 512-byte header can hold at most (512-0x4c)/4 of them. */
    if(num_bbd_blocks < 1 || num_bbd_blocks > (512-0x4c)/4) {
        fprintf(stderr,"2 =========> Input file has faulty bbd\n");
        exit (5);
    }
    bbd_entries = num_bbd_blocks * (512/4);
    BDepot = (char *)ole_alloc(512*(size_t)num_bbd_blocks);
    root_list[0] = ole_s32(Block+0x30);
    sbd_list[0] = ole_s32(Block+0x3c);
    if(debug) fprintf(stderr,"num_bbd_blocks %ld, root start %ld, sbd start %ld\n",num_bbd_blocks,root_list[0],sbd_list[0]);

    /* read big block Depot */
    s = BDepot;
    for(i=0; i<(int)num_bbd_blocks; i++) {
        if(!ole_read_block(input, ole_s32(Block+0x4c+(i*4)), s)) {
            fprintf(stderr,"2 =========> Input file has faulty bbd\n");
            exit (5);
        }
        s += 0x200;
    }

    /* Extract the sbd block list; it is empty when the header gives -2. */
    sbd_len = ole_follow_chain(BDepot, bbd_entries, sbd_list, "sbd");
    if(sbd_len > 0) SDepot = (char *)ole_alloc(512*(size_t)sbd_len);
    /* Read in Small Block Depot */
    s = SDepot;
    for(i=0; i<sbd_len; i++) {
        if(!ole_read_block(input, sbd_list[i], s)) {
            fprintf(stderr,"3 =========> Input file has faulty OLE format\n");
            return 5;
        }
        s += 0x200;
    }

    /* Extract the root block list */
    len = ole_follow_chain(BDepot, bbd_entries, root_list, "root");
    for(i=1; i<=len && i<MAXBLOCKS; i++) fprintf(stderr,"root block %d\n",i);
    if(len == 0) {
        fprintf(stderr,"4 =========> Input file has faulty OLE format\n");
        return 5;
    }
    Root = (char *)ole_alloc(512*(size_t)len);
    /* Read in Root stream data */
    s = Root;
    for(i=0; i<len; i++) {
        if(!ole_read_block(input, root_list[i], s)) {
            fprintf(stderr,"4 =========> Input file has faulty OLE format\n");
            return 5;
        }
        s += 0x200;
    }

    /* assign space for pps list: four 0x80-byte entries per root block */
    npps = len*4;
    pps_list = (pps_entry **)ole_alloc((size_t)npps*sizeof(pps_entry *));
    for(j=0; j<npps; j++) {
        pps_list[j] = (pps_entry *)ole_alloc(sizeof(pps_entry));
        /* Give every field a defined value; unused entries are skipped
           below before their fields would otherwise be filled in. */
        memset(pps_list[j], 0, sizeof(pps_entry));
        pps_list[j]->previous = NULL;
        pps_list[j]->next = NULL;
        pps_list[j]->directory = NULL;
    }

    /* Store pss entry details and look out for Root Entry */
    for(j=0; j<npps; j++) {
        pps_list[j]->level = -1;
        pps_list[j]->index = j;
        s = Root+(j*0x80);
        /* some pps names have first byte as an integer !!
           so we make it visible so you can extract a named pps */
        if(!isprint((unsigned char)*s)) *s = *s + 48;
        pps_list[j]->nsize = ShortInt((unsigned char *)(s+0x40));
        if(pps_list[j]->nsize == 0) continue;
        /* The name field ends where the size field starts (0x40); never
           copy past it, whatever size a damaged entry claims. */
        name_bytes = pps_list[j]->nsize;
        if(name_bytes > 0x40) name_bytes = 0x40;
        /* keep the first byte of every 2-byte character */
        for(p=pps_list[j]->name,t=s; t<s+name_bytes; t+=2) *p++ = *t;
        *p = '\0';
        pps_list[j]->type = s[0x42];
        if(pps_list[j]->type == 5) {
            root_entry = j; /* this is root */
        }
        /* Links are indexes into pps_list; -1 or anything outside the
           list is stored as "no link". */
        linkto = ole_s32(s+0x44);
        pps_list[j]->previous = (linkto >= 0 && linkto < npps) ? pps_list[linkto] : NULL;
        linkto = ole_s32(s+0x48);
        pps_list[j]->next = (linkto >= 0 && linkto < npps) ? pps_list[linkto] : NULL;
        linkto = ole_s32(s+0x4c);
        pps_list[j]->directory = (linkto >= 0 && linkto < npps) ? pps_list[linkto] : NULL;
        pps_list[j]->start = ole_s32(s+0x74);
        pps_list[j]->size = ole_s32(s+0x78);
    }
    if(root_entry < 0) {
        fprintf(stderr,"6 =========> Input file has no root entry\n");
        exit (5);
    }

    /* go through the pps entries, tagging them with level number
       use recursive routine to follow list starting at root entry */
    unravel(pps_list[root_entry],0);

    /* go through the level 0 list looking for named entries */
    for(j=0; j<npps; j++) {
        if(pps_list[j]->nsize == 0) continue; /* skip empty pps */
        /* we mostly only want the top level (level 1) stuff, so
           here we skip anything more deeply nested. */
        if(pps_list[j]->level > 1) continue;
        pps_start = pps_list[j]->start;
        pps_size = pps_list[j]->size;
        OLEfile = NULL;
        if(pps_list[j]->type == 5) { /* Root entry */
            /* Its data is kept in a temporary file from which the
               small-block entries are read later. */
            if(sbfile != NULL) fclose(sbfile);
            OLEfile = tmpfile();
            sbfile = OLEfile;
            if(debug) fprintf(stderr,"Reading sbFile %ld\n",pps_start);
        }
        else if(!strcmp(pps_list[j]->name, Target)) {
            OLEfile=fopen("OLE.tmp","w+b"); /* try and open */
            printf("Reading Target %s\n", Target);
            if(OLEfile == NULL) fprintf(stderr,"Error opening file OLE.tmp\n");
        }
        if(OLEfile == NULL) continue;
        if(pps_size<=0) {
            /* nothing to copy; do not leave the output file open */
            if(OLEfile != sbfile) fclose(OLEfile);
            continue;
        }
        /* Entries of 4096 bytes or more, and the root entry's own data,
           are chained through the big block depot; everything else
           through the small block depot. */
        if(pps_size>=4096 || OLEfile==sbfile) {
            Offset = 1;
            BlockSize = 512;
            infile = input;
            Depot = BDepot;
            depot_entries = bbd_entries;
        } else {
            Offset = 0;
            BlockSize = 64;
            infile = sbfile;
            Depot = SDepot;
            depot_entries = (long int)sbd_len * (512/4);
        }
        if(infile == NULL) {
            fprintf(stderr,"5 =========> Input file has faulty OLE format\n");
            exit (5);
        }
        while(pps_start != -2) {
            if(debug) fprintf(stderr,"Reading block %ld\n",pps_start);
            /* the block number indexes the depot below; reject it first */
            if(pps_start < 0 || pps_start >= depot_entries) {
                fprintf(stderr,"5 =========> Input file has faulty OLE format\n");
                exit (5);
            }
            FilePos = (unsigned long int)(pps_start+Offset) * (unsigned long int)BlockSize;
            bytes = (int)MIN((long int)BlockSize,pps_size);
            if(fseek(infile,(long int)FilePos,SEEK_SET) != 0 ||
               fread(Block,bytes,1,infile) != 1) {
                fprintf(stderr,"5 =========> Input file has faulty OLE format\n");
                exit (5);
            }
            if(fwrite(Block,bytes,1,OLEfile) != 1) {
                fprintf(stderr,"Error writing output file\n");
                exit (5);
            }
            pps_start = ole_s32(Depot+(pps_start*4));
            pps_size -= BlockSize;
            if(pps_size <= 0) pps_start=-2;
        }
        if(OLEfile == sbfile) {
            rewind(OLEfile); /* read back later for small-block entries */
        } else if(fclose(OLEfile) != 0) {
            fprintf(stderr,"Error writing output file\n");
            exit (5);
        }
    }

    for(j=0; j<npps; j++) free(pps_list[j]);
    free(pps_list);
    free(Root);
    free(SDepot);
    free(BDepot);
    free(Block);
    if(sbfile != NULL) fclose(sbfile);
    fclose(input);
    return 0;
}
对上述 ole.c 程序编译和连接之后,生成 ole.exe 程序。
例子:列出Word 文档 a.doc 的目录结构。
在DOS 提示符下执行命令: ole a.doc
本节介绍的Office 文档的数据恢复的原理和程序设计方法,都来源于对Office 文档结构的分析。如果对Office 文档的结构进行更深入的研究,将会发现更好的修复方法,从而挽救更多的数据,并有可能开发出能对各类受损Office 文档自动进行全面恢复的工具软件。
《数据恢复高级技术》汪中夏老师
北京信息工程学院数据恢复实验室 |