librsync  2.3.3
delta.c
Go to the documentation of this file.
1/*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2 *
3 * librsync -- library for network deltas
4 *
5 * Copyright (C) 2000, 2001 by Martin Pool <mbp@sourcefrog.net>
6 * Copyright (C) 2003 by Donovan Baarda <abo@minkirri.apana.org.au>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as published by
10 * the Free Software Foundation; either version 2.1 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23 /*=
24 | Let's climb to the TOP of that
25 | MOUNTAIN and think about STRIP
26 | MINING!!
27 */
28
29/** \file delta.c
30 * Generate in streaming mode an rsync delta given a set of signatures, and a
31 * new file.
32 *
33 * The size of blocks for signature generation is determined by the block size
34 * in the incoming signature.
35 *
36 * To calculate a signature, we need to be able to see at least one block of
37 * the new file at a time. Once we have that, we calculate its weak signature,
38 * and see if there is any block in the signature hash table that has the same
39 * weak sum. If there is one, then we also compute the strong sum of the new
40 * block, and cross check that. If they're the same, then we can assume we have
41 * a match.
42 *
43 * The final block of the file has to be handled a little differently, because
44 * it may be a short match. Short blocks in the signature don't include their
45 * length -- we just allow for the final short block of the file to match any
46 * block in the signature, and if they have the same checksum we assume they
47 * must have the same length. Therefore, when we emit a COPY command, we have
48 * to send it with a length that is the same as the block matched, and not the
49 * block length from the signature.
50 *
51 * Profiling results as of v1.26, 2001-03-18:
52 *
53 * If everything matches, then we spend almost all our time in rs_mdfour64 and
54 * rs_weak_sum, which is unavoidable and therefore a good profile.
55 *
56 * If nothing matches, it is not so good.
57 *
58 * 2002-06-26: Donovan Baarda
59 *
60 * The following is based entirely on pysync. It is much cleaner than the
61 * previous incarnation of this code. It is slightly complicated because in
62 * this case the output can block, so the main delta loop needs to stop when
63 * this happens.
64 *
65 * In pysync a 'last' attribute is used to hold the last miss or match for
66 * extending if possible. In this code, basis_len and scan_pos are used instead
67 * of 'last'. When basis_len > 0, last is a match. When basis_len = 0 and
68 * scan_pos is > 0, last is a miss. When both are 0, last is None (ie,
69 * nothing).
70 *
71 * Pysync is also slightly different in that a 'flush' method is available to
72 * force output of accumulated data. This 'flush' is used to finalise delta
73 * calculation. In librsync input is terminated with an eof flag on the input
74 * stream. I have structured this code similar to pysync with a separate flush
75 * function that is used when eof is reached. This allows for a flush style API
76 * if one is ever needed. Note that flush in pysync can be used for more than
77 * just terminating delta calculation, so a flush based API can in some ways be
78 * more flexible...
79 *
80 * The input data is first scanned, then processed. Scanning identifies input
81 * data as misses or matches, and emits the instruction stream. Processing the
82 * data consumes it off the input scoop and outputs the processed miss data
83 * into the tube.
84 *
85 * The scoop contains all data yet to be processed. The scan_pos is an index
86 * into the scoop that indicates the point scanned to. As data is scanned,
87 * scan_pos is incremented. As data is processed, it is removed from the scoop
88 * and scan_pos adjusted. Everything gets complicated because the tube can
89 * block. When the tube is blocked, no data can be processed. */
90
91#include <assert.h>
92#include <stdlib.h>
93#include "librsync.h"
94#include "job.h"
95#include "sumset.h"
96#include "checksum.h"
97#include "scoop.h"
98#include "emit.h"
99#include "trace.h"
100
101/** Max length of a miss is 64K including 3 command bytes. */
102#define MAX_MISS_LEN (MAX_DELTA_CMD - 3)
103
105static rs_result rs_delta_s_flush(rs_job_t *job);
106static rs_result rs_delta_s_end(rs_job_t *job);
107static inline rs_result rs_getinput(rs_job_t *job, size_t block_len);
108static inline int rs_findmatch(rs_job_t *job, rs_long_t *match_pos,
109 size_t *match_len);
110static inline rs_result rs_appendmatch(rs_job_t *job, rs_long_t match_pos,
111 size_t match_len);
112static inline rs_result rs_appendmiss(rs_job_t *job, size_t miss_len);
113static inline rs_result rs_appendflush(rs_job_t *job);
114static inline rs_result rs_processmatch(rs_job_t *job);
115static inline rs_result rs_processmiss(rs_job_t *job);
116
117/** Get a block of data if possible, and see if it matches.
118 *
119 * On each call, we try to process all of the input data available on the scoop
120 * and input buffer. */
122{
123 const size_t block_len = job->signature->block_len;
124 rs_long_t match_pos;
125 size_t match_len;
126 rs_result result;
127
128 rs_job_check(job);
129 /* output any pending output from the tube */
130 if ((result = rs_tube_catchup(job)) != RS_DONE)
131 return result;
132 /* read the input into the scoop */
133 if ((result = rs_getinput(job, block_len)) != RS_DONE)
134 return result;
135 /* while output is not blocked and there is a block of data */
136 while ((result == RS_DONE) && ((job->scan_pos + block_len) < job->scan_len)) {
137 /* check if this block matches */
138 if (rs_findmatch(job, &match_pos, &match_len)) {
139 /* append the match and reset the weak_sum */
140 result = rs_appendmatch(job, match_pos, match_len);
141 weaksum_reset(&job->weak_sum);
142 } else {
143 /* rotate the weak_sum and append the miss byte */
144 weaksum_rotate(&job->weak_sum, job->scan_buf[job->scan_pos],
145 job->scan_buf[job->scan_pos + block_len]);
146 result = rs_appendmiss(job, 1);
147 }
148 }
149 /* if we completed OK */
150 if (result == RS_DONE) {
151 /* if we reached eof, we can flush the last fragment */
152 if (job->stream->eof_in) {
153 job->statefn = rs_delta_s_flush;
154 return RS_RUNNING;
155 } else {
156 /* we are blocked waiting for more data */
157 return RS_BLOCKED;
158 }
159 }
160 return result;
161}
162
163static rs_result rs_delta_s_flush(rs_job_t *job)
164{
165 const size_t block_len = job->signature->block_len;
166 rs_long_t match_pos;
167 size_t match_len;
168 rs_result result;
169
170 rs_job_check(job);
171 /* output any pending output from the tube */
172 if ((result = rs_tube_catchup(job)) != RS_DONE)
173 return result;
174 /* read the input into the scoop */
175 if ((result = rs_getinput(job, block_len)) != RS_DONE)
176 return result;
177 /* while output is not blocked and there is any remaining data */
178 while ((result == RS_DONE) && (job->scan_pos < job->scan_len)) {
179 /* check if this block matches */
180 if (rs_findmatch(job, &match_pos, &match_len)) {
181 /* append the match and reset the weak_sum */
182 result = rs_appendmatch(job, match_pos, match_len);
183 weaksum_reset(&job->weak_sum);
184 } else {
185 /* rollout from weak_sum and append the miss byte */
186 weaksum_rollout(&job->weak_sum, job->scan_buf[job->scan_pos]);
187 rs_trace("block reduced to " FMT_SIZE "",
188 weaksum_count(&job->weak_sum));
189 result = rs_appendmiss(job, 1);
190 }
191 }
192 /* if we are not blocked, flush and set end statefn. */
193 if (result == RS_DONE) {
194 result = rs_appendflush(job);
195 job->statefn = rs_delta_s_end;
196 }
197 if (result == RS_DONE) {
198 return RS_RUNNING;
199 }
200 return result;
201}
202
203static rs_result rs_delta_s_end(rs_job_t *job)
204{
205 rs_emit_end_cmd(job);
206 return RS_DONE;
207}
208
209static inline rs_result rs_getinput(rs_job_t *job, size_t block_len)
210{
211 size_t min_len = block_len + MAX_DELTA_CMD;
212
213 job->scan_len = rs_scoop_avail(job);
214 if (job->scan_len < min_len && !job->stream->eof_in)
215 job->scan_len = min_len;
216 return rs_scoop_readahead(job, job->scan_len, (void **)&job->scan_buf);
217}
218
219/** find a match at scan_pos, returning the match_pos and match_len.
220 *
221 * Note that this will calculate weak_sum if required. It will also determine
222 * the match_len.
223 *
224 * This routine could be modified to do xdelta style matches that would extend
225 * matches past block boundaries by matching backwards and forwards beyond the
226 * block boundaries. Extending backwards would require decrementing scan_pos as
227 * appropriate. */
228static inline int rs_findmatch(rs_job_t *job, rs_long_t *match_pos,
229 size_t *match_len)
230{
231 const size_t block_len = job->signature->block_len;
232
233 /* calculate the weak_sum if we don't have one */
234 if (weaksum_count(&job->weak_sum) == 0) {
235 /* set match_len to min(block_len, scan_avail) */
236 *match_len = job->scan_len - job->scan_pos;
237 if (*match_len > block_len) {
238 *match_len = block_len;
239 }
240 /* Update the weak_sum */
241 weaksum_update(&job->weak_sum, job->scan_buf + job->scan_pos,
242 *match_len);
243 rs_trace("calculate weak sum from scratch length " FMT_SIZE "",
244 weaksum_count(&job->weak_sum));
245 } else {
246 /* set the match_len to the weak_sum count */
247 *match_len = weaksum_count(&job->weak_sum);
248 }
249 *match_pos =
250 rs_signature_find_match(job->signature, weaksum_digest(&job->weak_sum),
251 job->scan_buf + job->scan_pos, *match_len);
252 return *match_pos != -1;
253}
254
255/** Append a match at match_pos of length match_len to the delta, extending a
256 * previous match if possible, or flushing any previous miss/match. */
257static inline rs_result rs_appendmatch(rs_job_t *job, rs_long_t match_pos,
258 size_t match_len)
259{
260 rs_result result = RS_DONE;
261
262 /* if last was a match that can be extended, extend it */
263 if (job->basis_len && (job->basis_pos + job->basis_len) == match_pos) {
264 job->basis_len += match_len;
265 } else {
266 /* else appendflush the last value */
267 result = rs_appendflush(job);
268 /* make this the new match value */
269 job->basis_pos = match_pos;
270 job->basis_len = match_len;
271 }
272 /* increment scan_pos to point at next unscanned data */
273 job->scan_pos += match_len;
274 /* we can only process from the scoop if output is not blocked */
275 if (result == RS_DONE) {
276 /* process the match data off the scoop */
277 result = rs_processmatch(job);
278 }
279 return result;
280}
281
282/** Append a miss of length miss_len to the delta, extending a previous miss
283 * if possible, or flushing any previous match.
284 *
285 * This also breaks misses up into 32KB segments to avoid accumulating too much
286 * in memory. */
287static inline rs_result rs_appendmiss(rs_job_t *job, size_t miss_len)
288{
289 rs_result result = RS_DONE;
290
291 /* If last was a match, or MAX_MISS_LEN misses, appendflush it. */
292 if (job->basis_len || (job->scan_pos >= MAX_MISS_LEN)) {
293 result = rs_appendflush(job);
294 }
295 /* increment scan_pos */
296 job->scan_pos += miss_len;
297 return result;
298}
299
300/** Flush any accumulating hit or miss, appending it to the delta. */
302{
303 /* if last is a match, emit it and reset last by resetting basis_len */
304 if (job->basis_len) {
305 rs_trace("matched " FMT_LONG " bytes at " FMT_LONG "!", job->basis_len,
306 job->basis_pos);
307 rs_emit_copy_cmd(job, job->basis_pos, job->basis_len);
308 job->basis_len = 0;
309 return rs_processmatch(job);
310 /* else if last is a miss, emit and process it */
311 } else if (job->scan_pos) {
312 rs_trace("got " FMT_SIZE " bytes of literal data", job->scan_pos);
313 rs_emit_literal_cmd(job, (int)job->scan_pos);
314 return rs_processmiss(job);
315 }
316 /* otherwise, nothing to flush so we are done */
317 return RS_DONE;
318}
319
320/** Process matching data in the scoop.
321 *
322 * The scoop contains match data at scan_buf of length scan_pos. This function
323 * processes that match data, returning RS_DONE if it completes, or RS_BLOCKED
324 * if it gets blocked. After it completes scan_pos is reset to still point at
325 * the next unscanned data.
326 *
327 * This function currently just removes data from the scoop and adjusts
328 * scan_pos appropriately. In the future this could be used for something like
329 * context compressing of miss data. Note that it also calls rs_tube_catchup to
330 * output any pending output. */
332{
333 assert(job->copy_len == 0);
334 rs_scoop_advance(job, job->scan_pos);
335 job->scan_buf += job->scan_pos;
336 job->scan_len -= job->scan_pos;
337 job->scan_pos = 0;
338 return rs_tube_catchup(job);
339}
340
341/** Process miss data in the scoop.
342 *
343 * The scoop contains miss data at scan_buf of length scan_pos. This function
344 * processes that miss data, returning RS_DONE if it completes, or RS_BLOCKED
345 * if it gets blocked. After it completes scan_pos is reset to still point at
346 * the next unscanned data.
347 *
348 * This function uses rs_tube_copy to queue copying from the scoop into output.
349 * and uses rs_tube_catchup to do the copying. This automaticly removes data
350 * from the scoop, but this can block. While rs_tube_catchup is blocked,
351 * scan_pos does not point at legit data, so scanning can also not proceed.
352 *
353 * In the future this could do compression of miss data before outputing it. */
355{
356 assert(job->write_len > 0);
357 rs_tube_copy(job, job->scan_pos);
358 job->scan_buf += job->scan_pos;
359 job->scan_len -= job->scan_pos;
360 job->scan_pos = 0;
361 return rs_tube_catchup(job);
362}
363
364/** State function that does a slack delta containing only literal data to
365 * recreate the input. */
367{
368 size_t avail = rs_scoop_avail(job);
369
370 if (avail) {
371 rs_trace("emit slack delta for " FMT_SIZE " available bytes", avail);
372 rs_emit_literal_cmd(job, (int)avail);
373 rs_tube_copy(job, avail);
374 return RS_RUNNING;
375 } else if (rs_scoop_eof(job)) {
376 job->statefn = rs_delta_s_end;
377 return RS_RUNNING;
378 }
379 return RS_BLOCKED;
380}
381
382/** State function for writing out the header of the encoding job. */
384{
385 rs_emit_delta_header(job);
386 if (job->signature) {
388 } else {
389 rs_trace("no signature provided for delta, using slack deltas");
391 }
392 return RS_RUNNING;
393}
394
396{
397 rs_job_t *job;
398
399 job = rs_job_new("delta", rs_delta_s_header);
400 /* Caller can pass NULL sig or empty sig for "slack deltas". */
401 if (sig && sig->count > 0) {
403 /* Caller must have called rs_build_hash_table() by now. */
404 assert(sig->hashtable);
405 job->signature = sig;
406 weaksum_init(&job->weak_sum, rs_signature_weaksum_kind(sig));
407 }
408 return job;
409}
Abstract wrappers around different weaksum and strongsum implementations.
static int rs_findmatch(rs_job_t *job, rs_long_t *match_pos, size_t *match_len)
find a match at scan_pos, returning the match_pos and match_len.
Definition: delta.c:228
static rs_result rs_processmiss(rs_job_t *job)
Process miss data in the scoop.
Definition: delta.c:354
static rs_result rs_processmatch(rs_job_t *job)
Process matching data in the scoop.
Definition: delta.c:331
rs_job_t * rs_delta_begin(rs_signature_t *sig)
Prepare to compute a streaming delta.
Definition: delta.c:395
static rs_result rs_delta_s_scan(rs_job_t *job)
Get a block of data if possible, and see if it matches.
Definition: delta.c:121
static rs_result rs_delta_s_slack(rs_job_t *job)
State function that does a slack delta containing only literal data to recreate the input.
Definition: delta.c:366
static rs_result rs_appendmiss(rs_job_t *job, size_t miss_len)
Append a miss of length miss_len to the delta, extending a previous miss if possible,...
Definition: delta.c:287
static rs_result rs_appendflush(rs_job_t *job)
Flush any accumulating hit or miss, appending it to the delta.
Definition: delta.c:301
static rs_result rs_delta_s_header(rs_job_t *job)
State function for writing out the header of the encoding job.
Definition: delta.c:383
static rs_result rs_appendmatch(rs_job_t *job, rs_long_t match_pos, size_t match_len)
Append a match at match_pos of length match_len to the delta, extending a previous match if possible,...
Definition: delta.c:257
#define MAX_MISS_LEN
Max length of a miss is 64K including 3 command bytes.
Definition: delta.c:102
encoding output routines.
Generic state-machine interface.
#define rs_job_check(job)
Assert that a job is valid.
Definition: job.h:130
#define MAX_DELTA_CMD
Max length of a single delta command, including command bytes.
Definition: job.h:44
Public header for librsync.
rs_result
Return codes from nonblocking rsync operations.
Definition: librsync.h:180
@ RS_RUNNING
The job is still running, and not yet finished or blocked.
Definition: librsync.h:183
@ RS_DONE
Completed successfully.
Definition: librsync.h:181
@ RS_BLOCKED
Blocked waiting for more data.
Definition: librsync.h:182
rs_result rs_scoop_readahead(rs_job_t *job, size_t len, void **ptr)
Read from scoop without advancing.
Definition: scoop.c:147
void rs_scoop_advance(rs_job_t *job, size_t len)
Advance the input cursor forward len bytes.
Definition: scoop.c:116
Manage librsync streams of IO.
rs_result rs_tube_catchup(rs_job_t *job)
Put whatever will fit from the tube into the output of the stream.
Definition: tube.c:119
static bool rs_scoop_eof(rs_job_t *job)
Test if the scoop has reached eof.
Definition: scoop.h:100
void rs_tube_copy(rs_job_t *job, size_t len)
Queue up a request to copy through len bytes from the input to the output of the stream.
Definition: tube.c:159
int eof_in
True if there is no more data after this.
Definition: librsync.h:345
The contents of this structure are private.
Definition: job.h:47
rs_long_t basis_pos
Copy from the basis position.
Definition: job.h:117
rs_byte_t * scan_buf
The delta scan buffer, where scan_buf[scan_pos..scan_len] is the data yet to be scanned.
Definition: job.h:104
size_t copy_len
If copy_len is >0, then that much data should be copied through from the input.
Definition: job.h:114
size_t scan_pos
The delta scan position.
Definition: job.h:106
size_t scan_len
The delta scan buffer length.
Definition: job.h:105
rs_result(* statefn)(rs_job_t *)
Callback for each processing step.
Definition: job.h:56
rs_signature_t * signature
Pointer to the signature that's being used by the operation.
Definition: job.h:72
weaksum_t weak_sum
The rollsum weak signature accumulator used by delta.c.
Definition: job.h:84
Signature of a whole file.
Definition: sumset.h:44
int count
Total number of blocks.
Definition: sumset.h:48
hashtable_t * hashtable
The hashtable for finding matches.
Definition: sumset.h:51
int block_len
The block length.
Definition: sumset.h:46
The rs_signature class implementation of a file signature.
static weaksum_kind_t rs_signature_weaksum_kind(rs_signature_t const *sig)
Get the weaksum kind for a signature.
Definition: sumset.h:114
#define rs_signature_check(sig)
Assert that a signature is valid.
Definition: sumset.h:107
logging functions.