//C- -*- C++ -*- //C- ------------------------------------------------------------------- //C- DjVuLibre-3.5 //C- Copyright (c) 2002 Leon Bottou and Yann Le Cun. //C- Copyright (c) 2001 AT&T //C- //C- This software is subject to, and may be distributed under, the //C- GNU General Public License, Version 2. The license should have //C- accompanied the software or you may obtain a copy of the license //C- from the Free Software Foundation at http://www.fsf.org . //C- //C- This program is distributed in the hope that it will be useful, //C- but WITHOUT ANY WARRANTY; without even the implied warranty of //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //C- GNU General Public License for more details. //C- //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library //C- distributed by Lizardtech Software. On July 19th 2002, Lizardtech //C- Software authorized us to replace the original DjVu(r) Reference //C- Library notice by the following text (see doc/lizard2002.djvu): //C- //C- ------------------------------------------------------------------ //C- | DjVu (r) Reference Library (v. 3.5) //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved. //C- | The DjVu Reference Library is protected by U.S. Pat. No. //C- | 6,058,214 and patents pending. //C- | //C- | This software is subject to, and may be distributed under, the //C- | GNU General Public License, Version 2. The license should have //C- | accompanied the software or you may obtain a copy of the license //C- | from the Free Software Foundation at http://www.fsf.org . //C- | //C- | The computer code originally released by LizardTech under this //C- | license and unmodified by other parties is deemed "the LIZARDTECH //C- | ORIGINAL CODE." 
Subject to any third party intellectual property //C- | claims, LizardTech grants recipient a worldwide, royalty-free, //C- | non-exclusive license to make, use, sell, or otherwise dispose of //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU //C- | General Public License. This grant only confers the right to //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to //C- | the extent such infringement is reasonably necessary to enable //C- | recipient to make, have made, practice, sell, or otherwise dispose //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to //C- | any greater extent that may be necessary to utilize further //C- | modifications or combinations. //C- | //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. //C- +------------------------------------------------------------------ // // $Id: DataPool.h,v 1.10 2003/11/07 22:08:20 leonb Exp $ // $Name: release_3_5_15 $ #ifndef _DATAPOOL_H #define _DATAPOOL_H #ifdef HAVE_CONFIG_H #include "config.h" #endif #if NEED_GNUG_PRAGMAS # pragma interface #endif #include "GThreads.h" #include "GString.h" #include "GURL.h" #ifdef HAVE_NAMESPACES namespace DJVU { # ifdef NOT_DEFINED // Just to fool emacs c++ mode } #endif #endif class ByteStream; /** @name DataPool.h Files #"DataPool.h"# and #"DataPool.cpp"# implement classes \Ref{DataPool} and \Ref{DataRange} used by DjVu decoder to access data. The main goal of class \Ref{DataPool} is to provide concurrent access to the same data from many threads with a possibility to add data from yet another thread. 
It is especially important in the case of the Netscape plugin when data is not immediately available, but decoding should be started as soon as possible. In this situation it is vital to provide transparent access to the data from many threads possibly blocking readers that try to access information that has not been received yet. When the data is local though, it can be accessed directly using standard IO mechanism. To provide a uniform interface for decoding routines, \Ref{DataPool} supports file mode as well. @memo Thread safe data storage @author Andrei Erofeev <eaf@geocities.com> @version #$Id: DataPool.h,v 1.10 2003/11/07 22:08:20 leonb Exp $# */ //@{ /** Thread safe data storage. The purpose of #DataPool# is to provide a uniform interface for accessing data from decoding routines running in a multi-threaded environment. Depending on the mode of operation it may contain the actual data, may be connected to another #DataPool# or may be mapped to a file. Regardless of the mode, the class returns data in a thread-safe way, blocking reading threads if there is no data of interest available. This blocking is especially useful in the networking environment (plugin) when there is a running decoding thread, which wants to start decoding as soon as there is just one byte available blocking if necessary. Access to data in a #DataPool# may be direct (Using \Ref{get_data}() function) or sequential (See \Ref{get_stream}() function). If the #DataPool# is not connected to anything, that is it contains some real data, this data can be added to it by means of two \Ref{add_data}() functions. One of them adds data sequentially maintaining the offset of the last block of data added by it. The other can store data anywhere. Thus it's important to realize, that there may be "white spots" in the data storage. There is also a way to test if data is available for some given data range (See \Ref{has_data}()). 
    In addition to this mechanism, there are so-called {\em trigger
    callbacks}, which are called when all data is available for a given
    data range.

    Let us consider all modes of operation in detail:

    \begin{enumerate}
       \item {\bf Not connected #DataPool#}. In this mode the #DataPool#
             contains some real data. As mentioned above, it may be added
             by means of two functions \Ref{add_data}(), which operate
             independently of each other and allow data to be added either
             sequentially or directly at any place in the data storage.
             It's important to call function \Ref{set_eof}() after all
             data has been added.

             Functions like \Ref{get_data}() or \Ref{get_stream}() can
             be used to obtain direct or sequential access to the data. As
             long as \Ref{is_eof}() is #FALSE#, #DataPool# will block every
             reader that is trying to read unavailable data until it
             really becomes available. But as soon as \Ref{is_eof}() is
             #TRUE#, any attempt to read non-existing data will read #0#
             bytes.

             Taking into account the fact that #DataPool# was designed to
             store DjVu files, which are in IFF format, it becomes possible
             to predict the size of the #DataPool# as soon as the first
             #32# bytes have been added. This is invaluable for estimating
             download progress. See function \Ref{get_length}() for
             details. If this estimate fails (which means that the stored
             data is not in IFF format), \Ref{get_length}() returns #-1#.

             Triggers may be added and removed by means of
             \Ref{add_trigger}() and \Ref{del_trigger}() functions.
             \Ref{add_trigger}() takes a data range. As soon as all data
             in that data range is available, the trigger callback will be
             called. All trigger callbacks will be called when #EOF#
             condition has been set.

       \item {\bf #DataPool# connected to another #DataPool#}. In this
             {\em slave} mode you can map a given #DataPool# to any offsets
             range inside another #DataPool#. You can connect the slave
             #DataPool# even if there is no data in the master #DataPool#.
             Any \Ref{get_data}() request will be forwarded to the master
             #DataPool#, and it will be responsible for blocking readers
             trying to access unavailable data.

             The usage of \Ref{add_data}() functions is prohibited for
             connected #DataPool#s.

             The offsets range used to map a slave #DataPool# can be fully
             specified (both start offset and length are positive numbers)
             or partially specified (the length is negative). In the
             latter case the slave #DataPool# is assumed to extend up to
             the end of the master #DataPool#.

             Triggers may be used with slave #DataPool#s as well as with
             the master ones.

             Calling \Ref{stop}() function of a slave will stop only the
             slave (and any other slave connected to it), but not the
             master.

             \Ref{set_eof}() function is meaningless for slaves. They
             obtain the #ByteStream::EndOfFile# status from their master.

             Depending on the offsets range passed to the constructor,
             \Ref{get_length}() returns different values. If the length
             passed to the constructor was positive, then it is returned
             by \Ref{get_length}() all the time. Otherwise the value
             returned is either #-1# if master's length is still unknown
             (it didn't manage to parse IFF data yet) or it is calculated
             as #masters_length-slave_start#.

       \item {\bf #DataPool# connected to a file}. This mode is quite
             similar to the case when the #DataPool# is connected to
             another #DataPool#. Similarly, the #DataPool# stores no data
             inside. It just forwards all \Ref{get_data}() requests to the
             underlying source (a file in this case). Thus these requests
             will never block the reader. But they may return #0# if there
             is no data available at the requested offset.

             The usage of \Ref{add_data}() functions is meaningless and
             is prohibited.

             \Ref{is_eof}() function always returns #TRUE#. Thus
             \Ref{set_eof}() is meaningless and does nothing.

             \Ref{get_length}() function always returns the file size.

             Calling \Ref{stop}() function will stop this #DataPool# and
             any other slave connected to it.
Trigger callbacks passed through \Ref{add_trigger}() function are called
             immediately.

             This mode is useful to read and decode DjVu files without
             reading and storing them in full in memory.
    \end{enumerate}
*/

class DataPool : public GPEnabled
{
public: // Classes used internally by DataPool
        // These are declared public to support buggy C++ compilers.
   class Incrementor;
   class Reader;
   class Trigger;
   class OpenFiles;
   class OpenFiles_File;
   class BlockList;
   class Counter;
protected:
      // The constructor is protected: instances are obtained through the
      // static create() functions declared below.
   DataPool(void);
public:
      /** @name Initialization */
      //@{
      /** Default creator. Will prepare #DataPool# for accepting data
          added through functions \Ref{add_data}(). Use \Ref{connect}()
          functions if you want to map this #DataPool# to another or
          to a file. */
   static GP<DataPool> create(void);

      /** Creates and initializes the #DataPool# with data from stream
          #str#. The function will read the stream's contents and add
          them to the pool using the \Ref{add_data}() function. Afterwards
          it will call \Ref{set_eof}() function, and no other data will be
          allowed to be added to the pool. */
   static GP<DataPool> create(const GP<ByteStream> & str);

      /** Initializes the #DataPool# in slave mode and connects it to the
          specified offsets range of the specified master #DataPool#. It is
          equivalent to creating a default #DataPool# and calling function
          \Ref{connect}().

          @param master_pool Master #DataPool# providing data for this slave
          @param start Beginning of the offsets range which the slave is
                 mapped into
          @param length Length of the offsets range. If negative, the range
                 is assumed to extend up to the end of the master
                 #DataPool#. */
   static GP<DataPool> create(const GP<DataPool> & master_pool, int start=0, int length=-1);

      /** Initializes the #DataPool# in slave mode and connects it to the
          specified offsets range of the specified file. It is equivalent
          to creating a default #DataPool# and calling function
          \Ref{connect}().

          @param url Name of the file to connect to.
          @param start Beginning of the offsets range which the #DataPool#
                 is mapped into
          @param length Length of the offsets range. If negative, the range
                 is assumed to extend up to the end of the file. */
   static GP<DataPool> create(const GURL &url, int start=0, int length=-1);

      /** Destructor. */
   virtual ~DataPool();

      /** Switches the #DataPool# to slave mode and connects it to the
          specified offsets range of the master #DataPool#.

          @param master_pool Master #DataPool# providing data for this slave
          @param start Beginning of the offsets range which the slave is
                 mapped into
          @param length Length of the offsets range. If negative, the range
                 is assumed to extend up to the end of the master
                 #DataPool#. */
   void connect(const GP<DataPool> & master_pool, int start=0, int length=-1);

      /** Connects the #DataPool# to the specified offsets range of the
          named #url#.

          @param url Name of the file to connect to.
          @param start Beginning of the offsets range which the #DataPool#
                 is mapped into
          @param length Length of the offsets range. If negative, the range
                 is assumed to extend up to the end of the file. */
   void connect(const GURL &url, int start=0, int length=-1);
      //@}

      /** Tells the #DataPool# to stop serving readers.

          If #only_blocked# flag is #TRUE# then only those requests will
          be processed, which would not block. Any attempt to get
          non-existing data would result in a #STOP# exception (instead of
          blocking until data is available).

          If #only_blocked# flag is #FALSE# then any further attempt to
          read from this #DataPool# (as well as from any #DataPool#
          connected to this one) will result in a #STOP# exception. */
   void stop(bool only_blocked=false);

      /** @name Adding data.
          Please note, that these functions are for not connected
          #DataPool#s only. You cannot add data to a #DataPool#, which is
          connected to another #DataPool# or to a file. */
      //@{
      /** Appends the new block of data to the #DataPool#. There are two
          \Ref{add_data}() functions available. One is for adding data
          sequentially. It keeps track of the last byte position, which
          has been stored {\bf by it} and always appends the next block
          after this position. The other \Ref{add_data}() can store data
          anywhere.

          The function will unblock readers waiting for data if this data
          arrives with this block. It may also trigger some {\em trigger
          callbacks}, which may have been added by means of
          \Ref{add_trigger}() function.

          {\bf Note:} After all the data has been added, it's necessary to
          call \Ref{set_eof}() to tell the #DataPool# that nothing else is
          expected.

          {\bf Note:} This function may not be called if the #DataPool#
          has been connected to something.

          @param buffer data to append
          @param size length of the {\em buffer} */
   void add_data(const void * buffer, int size);

      /** Stores the specified block of data at the specified offset.
          Like the function above this one can also unblock readers
          waiting for data and engage trigger callbacks. The difference
          is that {\bf this} function can store data anywhere.

          {\bf Note:} After all the data has been added, it's necessary to
          call \Ref{set_eof}() to tell the #DataPool# that nothing else is
          expected.

          {\bf Note:} This function may not be called if the #DataPool#
          has been connected to something.

          @param buffer data to store
          @param offset where to store the data
          @param size length of the {\em buffer} */
   void add_data(const void * buffer, int offset, int size);

      /** Tells the #DataPool# that all data has been added and nothing
          else is anticipated. When #EOF# is true, any reader attempting
          to read non existing data will not be blocked. It will either
          read #ZERO# bytes or will get a #ByteStream::EndOfFile#
          exception (see \Ref{get_data}()). Calling this function will
          also activate all registered trigger callbacks.

          {\bf Note:} This function is meaningless and does nothing when
          the #DataPool# is connected to another #DataPool# or to a
          file. */
   void set_eof(void);
      //@}

      /** @name Accessing data.
          These functions provide direct and sequential access to the
          data of the #DataPool#. If the #DataPool# is not connected
          (contains some real data) then it handles the requests itself.
          Otherwise they are forwarded to the master #DataPool# or the
          file. */
      //@{
      /** Attempts to return a block of data at the given #offset# of the
          given #size#.

          \begin{enumerate}
             \item If the #DataPool# is connected to another #DataPool# or
                   to a file, the request will just be forwarded to them.
             \item If the #DataPool# is not connected to anything and
                   some of the data requested is in the internal buffer,
                   the function copies available data to #buffer# and
                   returns immediately.

                   If there is no data available, and \Ref{is_eof}()
                   returns #FALSE#, the reader (and the thread) will be
                   {\bf blocked} until the data actually arrives. Please
                   note, that since the reader is blocked, it should run
                   in a separate thread so that other threads have a
                   chance to call \Ref{add_data}().

                   If there is no data available, but \Ref{is_eof}() is
                   #TRUE# the behavior is different and depends on the
                   #DataPool#'s estimate of the file size:
                   \begin{itemize}
                      \item If #DataPool# learns from the IFF structure of
                            the data, that its size should be greater than
                            it really is, then any attempt to read
                            non-existing data in the range of {\em valid}
                            offsets will result in an
                            #ByteStream::EndOfFile# exception. This is done
                            to indicate, that there was an error in adding
                            data, and the data requested is {\bf supposed}
                            to be there, but has actually not been added.
                      \item If #DataPool#'s expectations about the data
                            size coincide with the reality then any attempt
                            to read data beyond the legal range of offsets
                            will result in #ZERO# bytes returned.
                   \end{itemize}.
          \end{enumerate}.

          @param buffer Buffer to be filled with data
          @param offset Offset in the #DataPool# to read data at
          @param size Size of the {\em buffer}
          @return The number of bytes actually read
          @exception STOP The stream has been stopped
          @exception EOF The requested data is not there and will not be
                     added, although it should have been. */
   int get_data(void * buffer, int offset, int size);

      /** Returns a \Ref{ByteStream} to access contents of the #DataPool#
          sequentially. By reading from the returned stream you basically
          call \Ref{get_data}() function. Thus, everything said for it
          remains true for the stream too. */
   GP<ByteStream> get_stream(void);
      //@}

      /** @name State querying functions. */
      //@{
      /** Returns #TRUE# if this #DataPool# is connected to another
          #DataPool# or to a file. */
   bool is_connected(void) const;

      /** Returns #TRUE# if all data is available for offsets from
          #start# till #start+length-1#. If #length# is negative, the
          range is assumed to extend up to the end of the #DataPool#.
          This function works both for connected and not connected
          #DataPool#s. Once it returned #TRUE# for some offsets range, you
          can be sure that the subsequent \Ref{get_data}() request will
          not block. */
   bool has_data(int start, int length);

      /** Returns #TRUE# if no more data is planned to be added.

          {\bf Note:} This function always returns #TRUE# when the
          #DataPool# has been initialized with a file name. */
   bool is_eof(void) const {return eof_flag;}

      /** Returns the {\em length} of data in the #DataPool#. The value
          returned depends on the mode of operation:
          \begin{itemize}
             \item If the #DataPool# is not connected to anything then
                   the length returned is either calculated by interpreting
                   the IFF structure of stored data (if successful) or
                   by calculating the real size of data after
                   \Ref{set_eof}() has been called. Otherwise it is #-1#.
             \item If the #DataPool# is connected to a file, the length
                   is calculated based on the length passed to the
                   \Ref{connect}() function and the file size.
             \item If the #DataPool# is connected to a master #DataPool#,
                   the length is calculated based on the value returned
                   by the master's #get_length()# function and the length
                   passed to the \Ref{connect}() function.
          \end{itemize}. */
   int get_length(void) const;

      /** Returns the number of bytes of data available in this
          #DataPool#. Contrary to the \Ref{get_length}() function, this
          one doesn't try to interpret the IFF structure and predict the
          file length. It just returns the number of bytes of data really
          available inside the #DataPool#, if it contains data, or inside
          its range, if it's connected to another #DataPool# or a file.

          Implemented by calling the private two-argument #get_size()#
          with #start=0# and #length=-1#. */
   int get_size(void) const {return get_size(0, -1);}
      //@}

      /** @name Trigger callbacks.
          {\em Trigger callbacks} are special callbacks called when
          all data for the given range of offsets has been made available.
          Since reading unavailable data may result in a thread block,
          which may be bad, the usage of {\em trigger callbacks} appears
          to be a convenient way to signal availability of data.

          You can add a trigger callback in two ways:
          \begin{enumerate}
             \item By specifying a range. This is the most general case
             \item By providing just one {\em threshold}. In this case
                   the range is assumed to start from offset #ZERO# and
                   last for {\em threshold}+1 bytes.
          \end{enumerate} */
      //@{
      /** Associates the specified {\em trigger callback} with the
          given data range.

          {\bf Note:} The callback may be called immediately if all
          data for the given range is already available or #EOF# is #TRUE#.

          @param start The beginning of the range for which all data
                 should be available
          @param length If the {\em length} is not negative then the
                 callback will be called when there is data available for
                 every offset from {\em start} to {\em start+length-1}.
                 If {\em length} is negative, the callback is called after
                 #EOF# condition has been set.
          @param callback Function to call
          @param cl_data Argument to pass to the callback when it's
                 called. */
   void add_trigger(int start, int length,
//                  void (* callback)(GP<GPEnabled> &), GP<GPEnabled> cl_data);
                    void (* callback)(void *), void * cl_data);

      /** Associates the specified {\em trigger callback} with the
          specified threshold.

          This function is a simplified version of the function above.
          The callback will be called when there is data available for
          every offset from #0# to #thresh#, if #thresh# is positive, or
          when #EOF# condition has been set otherwise. */
//   void add_trigger(int thresh, void (* callback)(GP<GPEnabled> &), GP<GPEnabled> cl_data);
   void add_trigger(int thresh, void (* callback)(void *), void * cl_data);

      /** Use this function to unregister callbacks, which are no longer
          needed. {\bf Note!} It's important to do it when the client
          is about to be destroyed. */
   void del_trigger(void (* callback)(void *), void * cl_data);
//   void del_trigger(void (* callback)(GP<GPEnabled> &), GP<GPEnabled> cl_data);
      //@}

      /** Loads data from the file into memory. This function is only
          useful for #DataPool#s getting data from a file. It descends
          the #DataPool#s hierarchy until it either reaches a
          file-connected #DataPool# or #DataPool# containing the real
          data. In the latter case it does nothing, in the former case it
          makes the #DataPool# read all data from the file into memory and
          stop using the file.

          This may be useful when you want to overwrite the file and
          leave existing #DataPool#s with valid data. */
   void load_file(void);

      /** This function will make every #DataPool# in the program, which
          is connected to a file, to load the file contents to the main
          memory and close the file. This feature is important when you
          want to do something with the file like remove or overwrite it
          not affecting the rest of the program. */
   static void load_file(const GURL &url);

      /** This function will remove OpenFiles filelist. */
   static void close_all(void);

      // Internal. Used by 'OpenFiles'
   void clear_stream(const bool release = true);

      /** Useful in comparing data pools. Returns true if derived from
          same URL or bytestream. A #false# result is inconclusive (see
          the inline definition after the class). */
   bool simple_compare(DataPool &pool) const;
private:
      // Value reported by is_eof().
   bool eof_flag;
      // NOTE(review): presumably set by stop(); stop_blocked_flag
      // presumably corresponds to stop(only_blocked=true) -- confirm
      // against DataPool.cpp.
   bool stop_flag;
   bool stop_blocked_flag;

   Counter *active_readers;

      // Source or storage of data
      // 'pool' is the master DataPool; is_connected() treats a non-zero
      // 'pool' or a local-file 'furl' as "connected".
   GP<DataPool> pool;
   GURL furl;
   GP<OpenFiles_File> fstream;
   GCriticalSection class_stream_lock;
   GP<ByteStream> data;
   GCriticalSection data_lock;
   BlockList *block_list;
   int add_at;
   int start, length;

      // List of readers waiting for data
   GPList<Reader> readers_list;
   GCriticalSection readers_lock;

      // Triggers
   GPList<Trigger> triggers_list;      // List of passed or our triggers
   GCriticalSection triggers_lock;     // Lock for the list above
   GCriticalSection trigger_lock;      // Lock for static_trigger_cb()

   void init(void);
   void wait_for_data(const GP<Reader> & reader);
   void wake_up_all_readers(void);
   void check_triggers(void);
      // Private overload of get_data().
      // NOTE(review): the meaning of 'level' is not visible in this
      // header -- see DataPool.cpp.
   int get_data(void * buffer, int offset, int size, int level);
      // Range-based worker behind the public get_size(), which calls it
      // with (0, -1).
   int get_size(int start, int length) const;
   void restart_readers(void);

//   static void static_trigger_cb(GP<GPEnabled> &);
   static void static_trigger_cb(void *);
   void trigger_cb(void);
   void analyze_iff(void);
   void added_data(const int offset, const int size);
public:
      // NOTE(review): presumably the value thrown for the #STOP#
      // exception mentioned in the stop()/get_data() documentation --
      // confirm against DataPool.cpp.
  static const char *Stop;
  friend class FCPools;
};

// Returns true only when the two pools are certainly backed by the same
// source: they are the same object, or both carry the same valid,
// non-empty URL, or this pool has a ByteStream set and it is the same one
// as the other pool's.
inline bool
DataPool::simple_compare(DataPool &pool) const
{
  // return true if these pools are identical.  False means they may or may
  // not be identical.
  return (this == &pool)
    ||(furl.is_valid()&&!furl.is_empty()&&pool.furl.is_valid()&&(furl == pool.furl))
    ||(data && (data == pool.data));
}

// A pool counts as connected when its URL is a local file URL or when a
// master pool has been set.
inline bool
DataPool::is_connected(void) const
{
   return furl.is_local_file_url() || pool!=0;
}

//@}

#ifdef HAVE_NAMESPACES
}
# ifndef NOT_USING_DJVU_NAMESPACE
using namespace DJVU;
# endif
#endif
#endif