summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2012-12-07 14:37:35 -0800
committerSage Weil <sage@inktank.com>2012-12-07 14:37:35 -0800
commit4ef528eee4c2a3132aa840e86bf19843399710c0 (patch)
tree4697b02b357867eea650cd9410ddb4ce4371927b
parent1adce68f7e66389fb50fffa893699ea28dd4d494 (diff)
downloadceph-4ef528eee4c2a3132aa840e86bf19843399710c0.tar.gz
osd: implement prealloc/fallocate object operation
Implement a rados PREALLOC method that will call fallocate(2) to allocate disk blocks for a file, but not write to them. We choose the semantics that modify the file size so that the exposed object metadata will be less confusing. E.g., prealloc to 4MB will result in a 4MB object full of zeros (or whatever data was previously written). Include flags for only doing prealloc on object creation, and for only doing prealloc on an existing object. Signed-off-by: Sage Weil <sage@inktank.com>
-rw-r--r--src/include/rados.h11
-rw-r--r--src/os/FileStore.cc47
-rw-r--r--src/os/FileStore.h1
-rw-r--r--src/os/ObjectStore.h11
-rw-r--r--src/osd/ReplicatedPG.cc18
-rw-r--r--src/osd/osd_types.cc3
-rw-r--r--src/osdc/Objecter.h6
7 files changed, 97 insertions, 0 deletions
diff --git a/src/include/rados.h b/src/include/rados.h
index 073ad62bd5f..9a7aa159862 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -195,6 +195,8 @@ enum {
CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15,
+ CEPH_OSD_OP_PREALLOC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 16,
+
/* omap */
CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17,
CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18,
@@ -348,6 +350,11 @@ enum {
CEPH_OSD_CMPXATTR_MODE_U64 = 2
};
+enum {
+ CEPH_OSD_PREALLOC_FLAG_ONCREATE = 1, /* preallocate only if this op creates the object; no-op when it already exists */
+ CEPH_OSD_PREALLOC_FLAG_IFEXISTS = 2, /* preallocate only if the object already exists; no-op (no create) otherwise */
+};
+
/*
* an individual object operation. each may be accompanied by some data
* payload
@@ -389,6 +396,10 @@ struct ceph_osd_op {
__le64 offset, length;
__le64 src_offset;
} __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 offset, length;
+ __le64 flags; /* CEPH_OSD_PREALLOC_FLAG_* */
+ } __attribute__ ((packed)) prealloc;
};
__le32 payload_len;
} __attribute__ ((packed));
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 2c66a5ea7db..e2bd4988a09 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -2351,6 +2351,17 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
}
break;
+ case Transaction::OP_FALLOCATE:
+ {
+ coll_t cid = i.get_cid();
+ hobject_t oid = i.get_oid();
+ uint64_t off = i.get_length();
+ uint64_t len = i.get_length();
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _fallocate(cid, oid, off, len);
+ }
+ break;
+
case Transaction::OP_TRIMCACHE:
{
i.get_cid();
@@ -2932,6 +2943,42 @@ int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t l
return ret;
}
+/**
+ * Preallocate disk blocks for the byte range [offset, offset+len) of an
+ * object without writing data to it.
+ *
+ * When fallocate(2) is compiled in it is called with mode 0, which also
+ * extends the file size to offset+len if the file is shorter.  When it is
+ * not compiled in, or the kernel/filesystem reports that it is unsupported
+ * (EOPNOTSUPP/ENOSYS), only the file size is adjusted via ftruncate(2) so
+ * that the resulting object size is the same either way.
+ *
+ * @param cid     collection holding the object
+ * @param oid     object to preallocate
+ * @param offset  start of the range, in bytes
+ * @param len     length of the range, in bytes
+ * @return 0 on success, negative errno on failure
+ */
+int FileStore::_fallocate(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len)
+{
+  dout(15) << "fallocate " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+  int ret = 0;
+  // stays true when fallocate(2) is not compiled in
+  bool adjust_size_only = true;
+
+  // fallocate(2) and ftruncate(2) both reject a descriptor that is not open
+  // for writing (EBADF/EINVAL), so the object must not be opened O_RDONLY.
+  int fd = lfn_open(cid, oid, O_WRONLY);
+  if (fd < 0) {
+    // NOTE(review): assumes lfn_open leaves errno set on failure -- confirm;
+    // if it already returns a negative errno, propagate fd instead.
+    ret = -errno;
+    goto out;
+  }
+
+  // try the real way
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(DARWIN) && !defined(__FreeBSD__)
+  ret = fallocate(fd, 0, offset, len);
+  if (ret < 0)
+    ret = -errno;
+  // fall back only when the operation is unsupported; any other result
+  // (success or a real error such as ENOSPC) is final
+  adjust_size_only = (ret == -EOPNOTSUPP || ret == -ENOSYS);
+# endif
+#endif
+
+  if (adjust_size_only) {
+    // oh well; just make sure we adjust i_size
+    struct stat st;
+    ret = ::fstat(fd, &st);
+    // never shrink: only extend when the file is shorter than the range end
+    if (ret == 0 && (uint64_t)st.st_size < offset + len)
+      ret = ::ftruncate(fd, offset + len);
+    if (ret < 0)
+      ret = -errno;
+  }
+
+  TEMP_FAILURE_RETRY(::close(fd));
+ out:
+  dout(20) << "fallocate " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
+  return ret;
+}
+
+
int FileStore::_clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
const SequencerPosition& spos)
{
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index f18e1f88269..8d300b094e2 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -367,6 +367,7 @@ public:
int _touch(coll_t cid, const hobject_t& oid);
int _write(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl);
int _zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len);
+ int _fallocate(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len);
int _truncate(coll_t cid, const hobject_t& oid, uint64_t size);
int _clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
const SequencerPosition& spos);
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 439897f273a..3da605494ce 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -152,6 +152,8 @@ public:
OP_OMAP_SETKEYS = 32, // cid, attrset
OP_OMAP_RMKEYS = 33, // cid, keyset
OP_OMAP_SETHEADER = 34, // cid, header
+
+ OP_FALLOCATE = 35, // cid, oid, offset, length
};
private:
@@ -344,6 +346,15 @@ public:
::encode(len, tbl);
ops++;
}
+    /// Append an OP_FALLOCATE record to this transaction.  The record is the
+    /// op code followed by cid, oid, off and len, encoded in that order.
+    void fallocate(coll_t cid, const hobject_t& oid, uint64_t off, uint64_t len) {
+      __u32 opcode = OP_FALLOCATE;
+      ::encode(opcode, tbl);
+
+      // target object
+      ::encode(cid, tbl);
+      ::encode(oid, tbl);
+
+      // byte range to preallocate
+      ::encode(off, tbl);
+      ::encode(len, tbl);
+
+      ++ops;
+    }
void truncate(coll_t cid, const hobject_t& oid, uint64_t off) {
__u32 op = OP_TRUNCATE;
::encode(op, tbl);
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 76ad5089493..c86a3d41a27 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -2424,6 +2424,24 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
}
break;
+
+ case CEPH_OSD_OP_PREALLOC:
+ {
+ if ((op.prealloc.flags & CEPH_OSD_PREALLOC_FLAG_ONCREATE) && obs.exists) {
+ // obj already exists, no-op
+ } else if ((op.prealloc.flags & CEPH_OSD_PREALLOC_FLAG_IFEXISTS) && !obs.exists) {
+ // obj does not exist, no-op
+ } else {
+ if (!obs.exists) {
+ ctx->delta_stats.num_objects++;
+ t.touch(coll, soid);
+ obs.exists = true;
+ }
+ t.fallocate(coll, soid, op.prealloc.offset, op.prealloc.length);
+ }
+ }
+ break;
+
case CEPH_OSD_OP_CREATE:
{
int flags = le32_to_cpu(op.flags);
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 4a1b3fcf2ef..329b5d14eb3 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2747,6 +2747,9 @@ ostream& operator<<(ostream& out, const OSDOp& op)
case CEPH_OSD_OP_ROLLBACK:
out << " " << snapid_t(op.op.snap.snapid);
break;
+ case CEPH_OSD_OP_PREALLOC:
+ out << " " << op.op.prealloc.offset << "~" << op.op.prealloc.length << " flags " << op.op.prealloc.flags;
+ break;
default:
out << " " << op.op.extent.offset << "~" << op.op.extent.length;
if (op.op.extent.truncate_seq)
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 9a20849d574..3f4c8dde3fe 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -260,6 +260,12 @@ struct ObjectOperation {
bufferlist bl;
add_data(CEPH_OSD_OP_SPARSE_READ, off, len, bl);
}
+  /// Add a CEPH_OSD_OP_PREALLOC op covering off~len to this operation.
+  /// flags is stored as given in the op's prealloc.flags field
+  /// (CEPH_OSD_PREALLOC_FLAG_* values).
+  void prealloc(uint64_t off, uint64_t len, uint64_t flags) {
+    OSDOp& o = add_op(CEPH_OSD_OP_PREALLOC);
+    o.op.prealloc.flags = flags;
+    o.op.prealloc.offset = off;
+    o.op.prealloc.length = len;
+  }
void clone_range(const object_t& src_oid, uint64_t src_offset, uint64_t len, uint64_t dst_offset) {
add_clone_range(CEPH_OSD_OP_CLONERANGE, dst_offset, len, src_oid, src_offset, CEPH_NOSNAP);