summaryrefslogtreecommitdiff
path: root/chromium/content/browser/download/save_package.h
blob: 742fd2f0b86175519b1a79f6a99bef5f0a4949b5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_DOWNLOAD_SAVE_PACKAGE_H_
#define CONTENT_BROWSER_DOWNLOAD_SAVE_PACKAGE_H_

#include <stddef.h>
#include <stdint.h>

#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "base/containers/circular_deque.h"
#include "base/files/file_path.h"
#include "base/gtest_prod_util.h"
#include "base/macros.h"
#include "base/memory/ref_counted.h"
#include "base/memory/weak_ptr.h"
#include "base/time/time.h"
#include "components/download/public/common/download_item.h"
#include "content/browser/download/save_types.h"
#include "content/common/content_export.h"
#include "content/public/browser/download_manager_delegate.h"
#include "content/public/browser/save_page_type.h"
#include "content/public/browser/web_contents_observer.h"
#include "content/public/common/referrer.h"
#include "net/base/net_errors.h"
#include "services/metrics/public/cpp/ukm_source_id.h"
#include "url/gurl.h"

// NOTE(review): "url/gurl.h" is already included by this header and GURL is
// used by value further down in this file, so this forward declaration looks
// redundant -- confirm before removing.
class GURL;

// Forward declaration; this header only refers to download::DownloadItemImpl
// through pointers.
namespace download {
class DownloadItemImpl;
}

namespace content {
// Forward declarations of content types that this header names only in
// pointer, reference, template-argument or friend positions.
class DownloadManagerImpl;
class FrameTreeNode;
class RenderFrameHostImpl;
struct SavableSubframe;
class SaveFileManager;
class SaveItem;
class SavePackage;
class WebContents;

// SavePackage manages the process of saving a page as only-HTML, complete-HTML
// or MHTML and provides status information about the job.
// - only-html: the web page is saved to a single HTML file excluding
//   sub-resources and sub-frames.
// - complete-html: the web page's main frame HTML is saved to the user selected
//   file and a directory for the auxiliary files such as all sub-frame html
//   files, image files, css files and js files is created.
// - MHTML: the main frame and all auxiliary files are stored in a single text
//   file using the MHTML format.
//
// Each page saving job may include one or multiple files which need to be
// saved. Each file is represented by a SaveItem, and all SaveItems are owned
// by the SavePackage. SaveItems are created when a user initiates a page
// saving job, and exist for the duration of one contents's life time.
class CONTENT_EXPORT SavePackage
    : public base::RefCountedThreadSafe<SavePackage>,
      public WebContentsObserver,
      public base::SupportsWeakPtr<SavePackage> {
 public:
  // Describes what the SavePackage is currently waiting on (or its terminal
  // outcome). The current value is stored in |wait_state_|.
  enum WaitState {
    // Created but not yet initialized.
    INITIALIZE = 0,
    // Initialized, but saving has not started yet.
    START_PROCESS,
    // Waiting on a list of savable resources from the backend.
    RESOURCES_LIST,
    // Waiting for data sent from net IO or from the file system.
    NET_FILES,
    // Waiting for html DOM data sent from the render process.
    HTML_DATA,
    // Saving the page finished successfully.
    SUCCESSFUL,
    // Failed to save the page.
    FAILED
  };

  // Declared here and defined out of line.
  // NOTE(review): presumably the extension used when a generated file name
  // needs an HTML extension -- confirm at the definition.
  static const base::FilePath::CharType kDefaultHtmlExtension[];

  // Constructor for user initiated page saving. This constructor results in a
  // SavePackage that will generate and sanitize a suggested name for the user
  // in the "Save As" dialog box.
  explicit SavePackage(WebContents* web_contents);

  // Initializes the SavePackage. Returns true if it initializes properly.
  // This method must be called on the UI thread, because using
  // g_browser_process on a non-UI thread can cause crashes during shutdown.
  // |cb| will be called when the download::DownloadItem is created, before data
  // is written to disk.
  bool Init(const SavePackageDownloadCreatedCallback& cb);

  // Cancels all in-progress requests. May be triggered by the user
  // (|user_action| is true) or by an internal error. |cancel_download_item|
  // defaults to true.
  // NOTE(review): presumably |cancel_download_item| controls whether the
  // associated download::DownloadItem is cancelled as well -- confirm in the
  // implementation.
  void Cancel(bool user_action, bool cancel_download_item = true);

  // NOTE(review): undocumented in this header; presumably completes the save
  // job (see finished()) -- confirm in the implementation.
  void Finish();

  // Notifications sent from the download sequence to the UI thread.
  void StartSave(const SaveFileCreateInfo* info);
  bool UpdateSaveProgress(SaveItemId save_item_id,
                          int64_t size,
                          bool write_success);
  // Called for updating end state.
  void SaveFinished(SaveItemId save_item_id, int64_t size, bool is_success);
  void SaveCanceled(const SaveItem* save_item);

  // Calculates the percentage of the whole save page job that is done.
  // Rough percent complete, -1 means we don't know (since we didn't receive a
  // total size).
  int PercentComplete();

  // True if the user canceled the job or a disk error occurred.
  bool canceled() const { return user_canceled_ || disk_error_occurred_; }
  // Returns |finished_|.
  bool finished() const { return finished_; }
  // Returns the page type this job saves as (|save_type_|).
  SavePageType save_type() const { return save_type_; }

  // Returns the unique ID of this SavePackage.
  SavePackageId id() const { return unique_id_; }

  // NOTE(review): undocumented in this header; presumably starts gathering the
  // save path / type information that ContinueGetSaveInfo() and OnPathPicked()
  // then process -- confirm in the implementation.
  void GetSaveInfo();

 private:
  // The destructor is private, so RefCountedThreadSafe needs friend access to
  // delete the object.
  friend class base::RefCountedThreadSafe<SavePackage>;

  // Friends for testing. Needed for accessing the test-only constructor below.
  // NOTE(review): WebContentsImpl is grouped with the testing friends but is
  // not a test class by name; presumably it needs private access for
  // production code -- confirm.
  friend class SavePackageTest;
  friend class WebContentsImpl;
  FRIEND_TEST_ALL_PREFIXES(SavePackageTest, TestSuggestedSaveNames);
  FRIEND_TEST_ALL_PREFIXES(SavePackageTest, TestLongSafePureFilename);
  FRIEND_TEST_ALL_PREFIXES(SavePackageBrowserTest, ImplicitCancel);
  FRIEND_TEST_ALL_PREFIXES(SavePackageBrowserTest, ExplicitCancel);
  FRIEND_TEST_ALL_PREFIXES(SavePackageBrowserTest, DownloadItemDestroyed);

  // Map from SaveItem::id() (aka save_item_id) into a SaveItem. The map owns
  // its SaveItems (values are std::unique_ptr).
  using SaveItemIdMap = std::
      unordered_map<SaveItemId, std::unique_ptr<SaveItem>, SaveItemId::Hasher>;

  // Ordered set of file names whose ordering is supplied at construction time
  // as a function pointer taking two string pieces.
  using FileNameSet = std::set<base::FilePath::StringType,
                               bool (*)(base::FilePath::StringPieceType,
                                        base::FilePath::StringPieceType)>;

  // Map from a file name to a 32-bit counter (see |file_name_count_map_|).
  using FileNameCountMap =
      std::unordered_map<base::FilePath::StringType, uint32_t>;

  // Used only for testing. Bypasses the file and directory name generation /
  // sanitization by providing well known paths better suited for tests.
  SavePackage(WebContents* web_contents,
              SavePageType save_type,
              const base::FilePath& file_full_path,
              const base::FilePath& directory_full_path);

  // Private: instances are ref-counted and destroyed through
  // base::RefCountedThreadSafe (a friend above).
  ~SavePackage() override;

  // NOTE(review): undocumented in this header; presumably continues Init()
  // once the download::DownloadItemImpl |item| exists and then runs
  // |download_created_callback| -- confirm in the implementation.
  void InitWithDownloadItem(
      const SavePackageDownloadCreatedCallback& download_created_callback,
      download::DownloadItemImpl* item);

  // Callback for WebContents::GenerateMHTML().
  void OnMHTMLGenerated(int64_t size);

  // The notes on Init() above apply here as well.
  void InternalInit();

  // NOTE(review): undocumented in this header; |cancel_download_item| is
  // presumably the same flag that Cancel() takes -- confirm.
  void Stop(bool cancel_download_item);
  // NOTE(review): undocumented in this header; presumably checks whether the
  // job is complete and finishes it if so -- confirm.
  void CheckFinish();

  // Initiates a saving job of a specific URL. We send the request to
  // SaveFileManager, which will dispatch it to a different approach according
  // to the save source. |process_all_remainder_items| indicates whether we need
  // to save all remaining items.
  void SaveNextFile(bool process_all_remainder_items);

  // Continue processing the save page job after one SaveItem has been finished.
  void DoSavingProcess();

  // WebContentsObserver implementation.
  bool OnMessageReceived(const IPC::Message& message,
                         RenderFrameHost* render_frame_host) override;

  // Update the download history of this item upon completion.
  void FinalizeDownloadEntry();

  // Returns the max length of a path for a specific base directory.
  // This is needed on POSIX, which restricts the length of file names in
  // addition to the restriction on the length of path names.
  // |base_dir| is assumed to be a directory name with no trailing slash.
  static uint32_t GetMaxPathLengthForDirectory(const base::FilePath& base_dir);

  // Truncates a filename to fit length constraints.
  //
  // |directory|    : Directory containing target file.
  // |extension|    : Extension.
  // |max_path_len| : Maximum size allowed for |len(directory + base_name +
  //                  extension)|.
  // |base_name|    : Variable portion. The length of this component will be
  //                  adjusted to fit the length constraints described at
  //                  |max_path_len| above.
  //
  // Returns true if |base_name| could be successfully adjusted to fit the
  // aforementioned constraints, or false otherwise.
  // TODO(asanka): This function is wrong. |base_name| cannot be truncated
  //   without knowing its encoding and truncation has to be performed on
  //   character boundaries. Also the implementation doesn't look up the actual
  //   path constraints and instead uses hard coded constants. crbug.com/618737
  static bool TruncateBaseNameToFitPathConstraints(
      const base::FilePath& directory,
      const base::FilePath::StringType& extension,
      uint32_t max_path_len,
      base::FilePath::StringType* base_name);

  // Creates a file name based on the response from the server. The result is
  // written to |generated_name|.
  // NOTE(review): |need_html_ext| presumably forces an HTML extension on the
  // generated name, and the return value presumably signals success --
  // confirm in the implementation.
  bool GenerateFileName(const std::string& disposition,
                        const GURL& url,
                        bool need_html_ext,
                        base::FilePath::StringType* generated_name);

  // Main routine that initiates asking all frames for their savable resources.
  //
  // Responses are received asynchronously by OnSavableResourceLinks... methods
  // and pending responses are counted/tracked by
  // CompleteSavableResourceLinksResponse.
  //
  // OnSavableResourceLinksResponse creates SaveItems for each savable resource
  // and each subframe - these SaveItems get enqueued into |waiting_item_queue_|
  // with the help of CreatePendingSaveItem, EnqueueSavableResource,
  // EnqueueFrame.
  void GetSavableResourceLinks();

  // Response from |sender| frame to GetSavableResourceLinks request.
  void OnSavableResourceLinksResponse(
      RenderFrameHostImpl* sender,
      const std::vector<GURL>& resources_list,
      const Referrer& referrer,
      const std::vector<SavableSubframe>& subframes);

  // Helper for finding or creating a SaveItem with the given parameters.
  SaveItem* CreatePendingSaveItem(
      int container_frame_tree_node_id,
      int save_item_frame_tree_node_id,
      const GURL& url,
      const Referrer& referrer,
      SaveFileCreateInfo::SaveFileSource save_source);

  // Helper for finding a SaveItem with the given url, or falling back to
  // creating a SaveItem with the given parameters.
  void CreatePendingSaveItemDeduplicatingByUrl(
      int container_frame_tree_node_id,
      int save_item_frame_tree_node_id,
      const GURL& url,
      const Referrer& referrer,
      SaveFileCreateInfo::SaveFileSource save_source);

  // Helper to enqueue a savable resource reported by GetSavableResourceLinks.
  void EnqueueSavableResource(int container_frame_tree_node_id,
                              const GURL& url,
                              const Referrer& referrer);
  // Helper to enqueue a subframe reported by GetSavableResourceLinks.
  void EnqueueFrame(int container_frame_tree_node_id,
                    int frame_tree_node_id,
                    const GURL& frame_original_url);

  // Response to GetSavableResourceLinks that indicates an error when processing
  // the frame associated with |sender|.
  void OnSavableResourceLinksError(RenderFrameHostImpl* sender);

  // Helper tracking how many |number_of_frames_pending_response_| we have
  // left and kicking off the next phase after we got all the
  // OnSavableResourceLinksResponse messages we were waiting for.
  void CompleteSavableResourceLinksResponse();

  // For each frame in the current page, ask the renderer process associated
  // with that frame to serialize that frame into html.
  void GetSerializedHtmlWithLocalLinks();

  // Ask renderer process to serialize |target_tree_node| into html data
  // with resource links replaced with a link to a locally saved copy.
  void GetSerializedHtmlWithLocalLinksForFrame(FrameTreeNode* target_tree_node);

  // Routes html data (sent by renderer process in response to
  // GetSerializedHtmlWithLocalLinksForFrame above) to the associated local file
  // (and also keeps track of when all frames have been completed).
  void OnSerializedHtmlWithLocalLinksResponse(RenderFrameHostImpl* sender,
                                              const std::string& data,
                                              bool end_of_data);

  // Looks up a SaveItem by save item id in the in-progress map.
  SaveItem* LookupInProgressSaveItem(SaveItemId save_item_id);

  // Removes a SaveItem from the in-progress map and puts it into a saved map.
  void PutInProgressItemToSavedMap(SaveItem* save_item);

  // Retrieves the URL to be saved from the WebContents.
  static GURL GetUrlToBeSaved(WebContents* web_contents);

  // NOTE(review): the three declarations below are undocumented in this
  // header. Judging by names and signatures they form the "pick a save path"
  // flow (compute a suggested path off the UI thread, continue with it, then
  // handle the final path and page type) -- confirm in the implementation.
  static base::FilePath CreateDirectoryOnFileThread(
      const base::string16& title,
      const GURL& page_url,
      bool can_save_as_complete,
      const std::string& mime_type,
      const base::FilePath& website_save_dir,
      const base::FilePath& download_save_dir);
  void ContinueGetSaveInfo(bool can_save_as_complete,
                           const base::FilePath& suggested_path);
  void OnPathPicked(
      const base::FilePath& final_name,
      SavePageType type,
      const SavePackageDownloadCreatedCallback& cb);

  // The number of in-progress SaveItems (size of |in_progress_items_|).
  int in_process_count() const {
    return static_cast<int>(in_progress_items_.size());
  }

  // The number of all SaveItems which have completed, including success items
  // and failed items.
  int completed_count() const {
    return static_cast<int>(saved_success_items_.size() +
                            saved_failed_items_.size());
  }

  // The current speed in files per second. This is used to update the
  // download::DownloadItem associated to this SavePackage. The files per second
  // is presented by the download::DownloadItem to the UI as bytes per second,
  // which is not correct but matches the way the total and received number of
  // files is presented as the total and received bytes.
  int64_t CurrentSpeed() const;

  // A queue for items we are about to start saving.
  base::circular_deque<std::unique_ptr<SaveItem>> waiting_item_queue_;

  // Map of all saving jobs that are in progress.
  SaveItemIdMap in_progress_items_;

  // Map of all saving jobs that have failed.
  SaveItemIdMap saved_failed_items_;

  // Used to de-dupe urls that are being gathered into |waiting_item_queue_|
  // and also to find SaveItems to associate with a containing frame.
  // Note that |url_to_save_item_| does NOT own SaveItems - they
  // remain owned by waiting_item_queue_, in_progress_items_, etc.
  std::map<GURL, SaveItem*> url_to_save_item_;

  // Map used to route responses from a given subframe (i.e.
  // OnSerializedHtmlWithLocalLinksResponse) to the right SaveItem.
  // Note that |frame_tree_node_id_to_save_item_| does NOT own SaveItems - they
  // remain owned by waiting_item_queue_, in_progress_items_, etc.
  std::unordered_map<int, SaveItem*> frame_tree_node_id_to_save_item_;

  // Used to limit which local paths get exposed to which frames
  // (i.e. to prevent information disclosure to oop frames).
  // Note that |frame_tree_node_id_to_contained_save_items_| does NOT own
  // SaveItems - they remain owned by waiting_item_queue_, in_progress_items_,
  // etc.
  std::unordered_map<int, std::vector<SaveItem*>>
      frame_tree_node_id_to_contained_save_items_;

  // Number of frames that we still need to get a response from.
  int number_of_frames_pending_response_ = 0;

  // Map of all saving jobs that were saved successfully.
  SaveItemIdMap saved_success_items_;

  // Non-owning pointer for handling file writing on the download sequence.
  SaveFileManager* file_manager_ = nullptr;

  // DownloadManager owns the download::DownloadItem and handles history and UI.
  DownloadManagerImpl* download_manager_ = nullptr;
  download::DownloadItemImpl* download_ = nullptr;

  // The URL of the page the user wants to save.
  const GURL page_url_;
  // NOTE(review): presumably the destination paths of the main file and of
  // the auxiliary-files directory -- confirm in the implementation.
  base::FilePath saved_main_file_path_;
  base::FilePath saved_main_directory_path_;

  // The title of the page the user wants to save.
  const base::string16 title_;

  // Used to calculate package download speed (in files per second).
  const base::TimeTicks start_tick_;

  // Indicates whether the actual saving job has finished.
  bool finished_ = false;

  // Indicates whether the user canceled the saving job.
  bool user_canceled_ = false;

  // Indicates whether a disk error occurred.
  bool disk_error_occurred_ = false;

  // Variables to record errors that happened so we can record them via
  // UMA statistics.
  bool wrote_to_completed_file_ = false;
  bool wrote_to_failed_file_ = false;

  // The type the page is being saved as (e.g. only-html or complete-html).
  SavePageType save_type_ = SAVE_PAGE_TYPE_UNKNOWN;

  // Total number of resources that need to be saved.
  size_t all_save_items_count_ = 0;

  // This set is used to eliminate duplicated file names in saving directory.
  FileNameSet file_name_set_;

  // This map is used to track the serial number for a given file name.
  FileNameCountMap file_name_count_map_;

  // Indicates the current waiting state while SavePackage is waiting for
  // something from outside.
  WaitState wait_state_ = INITIALIZE;

  // Unique ID for this SavePackage.
  const SavePackageId unique_id_;

  // UKM IDs for reporting.
  // NOTE(review): unlike most members of this class these two have no default
  // member initializer, so they must be assigned by the out-of-line
  // implementation before being read -- confirm.
  ukm::SourceId ukm_source_id_;
  uint64_t ukm_download_id_;

  DISALLOW_COPY_AND_ASSIGN(SavePackage);
};

}  // namespace content

#endif  // CONTENT_BROWSER_DOWNLOAD_SAVE_PACKAGE_H_